@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. package/README.md +16 -2
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
  3. package/dist/transformers.js +2255 -931
  4. package/dist/transformers.min.js +19 -19
  5. package/dist/transformers.node.cjs +2300 -934
  6. package/dist/transformers.node.min.cjs +20 -20
  7. package/dist/transformers.node.min.mjs +20 -20
  8. package/dist/transformers.node.mjs +2336 -1012
  9. package/dist/transformers.web.js +2327 -1003
  10. package/dist/transformers.web.min.js +17 -17
  11. package/package.json +4 -4
  12. package/src/cache_utils.js +62 -0
  13. package/src/configs.js +45 -24
  14. package/src/env.js +8 -1
  15. package/src/image_processors_utils.js +27 -17
  16. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  17. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  18. package/src/models/chmv2/modeling_chmv2.js +4 -0
  19. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  20. package/src/models/detr/image_processing_detr.js +1 -1
  21. package/src/models/eurobert/modeling_eurobert.js +41 -0
  22. package/src/models/feature_extractors.js +2 -0
  23. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  24. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  25. package/src/models/glm46v/processing_glm46v.js +5 -0
  26. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  27. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  28. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  29. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  30. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  31. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  32. package/src/models/idefics3/modeling_idefics3.js +5 -32
  33. package/src/models/image_processors.js +3 -0
  34. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  35. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  36. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  37. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  38. package/src/models/llava/modeling_llava.js +1 -1
  39. package/src/models/mistral3/modeling_mistral3.js +2 -2
  40. package/src/models/mistral4/modeling_mistral4.js +5 -0
  41. package/src/models/modeling_utils.js +224 -308
  42. package/src/models/models.js +14 -1
  43. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  44. package/src/models/paligemma/modeling_paligemma.js +2 -25
  45. package/src/models/processors.js +4 -0
  46. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
  47. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
  48. package/src/models/qwen2_vl/modeling_qwen2_vl.js +194 -143
  49. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  50. package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
  51. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
  52. package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
  53. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
  54. package/src/models/registry.js +42 -0
  55. package/src/models/sam/image_processing_sam.js +1 -1
  56. package/src/models/session.js +17 -6
  57. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  58. package/src/models/solar_open/modeling_solar_open.js +5 -0
  59. package/src/models/ultravox/modeling_ultravox.js +1 -3
  60. package/src/models/voxtral/modeling_voxtral.js +3 -0
  61. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  62. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  63. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  64. package/src/models/whisper/feature_extraction_whisper.js +2 -12
  65. package/src/pipelines.js +1 -0
  66. package/src/transformers.js +2 -0
  67. package/src/utils/audio.js +18 -2
  68. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  69. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  70. package/src/utils/cache.js +5 -0
  71. package/src/utils/hub.js +4 -1
  72. package/src/utils/lru_cache.js +67 -0
  73. package/src/utils/memoize_promise.js +45 -0
  74. package/src/utils/model_registry/get_file_metadata.js +15 -2
  75. package/src/utils/model_registry/get_model_files.js +52 -78
  76. package/src/utils/tensor.js +18 -2
  77. package/types/cache_utils.d.ts +29 -0
  78. package/types/cache_utils.d.ts.map +1 -0
  79. package/types/configs.d.ts.map +1 -1
  80. package/types/env.d.ts +8 -0
  81. package/types/env.d.ts.map +1 -1
  82. package/types/image_processors_utils.d.ts +18 -1
  83. package/types/image_processors_utils.d.ts.map +1 -1
  84. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  85. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  86. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  87. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  88. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  89. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  90. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  91. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  92. package/types/models/detr/image_processing_detr.d.ts +1 -1
  93. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  94. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  95. package/types/models/feature_extractors.d.ts +2 -0
  96. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  97. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  98. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  99. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  100. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  101. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  102. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  103. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  104. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  105. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  106. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  107. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  108. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  109. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  110. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  111. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  112. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  113. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  114. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  115. package/types/models/image_processors.d.ts +3 -0
  116. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  117. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  118. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  119. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  120. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  121. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  122. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  123. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  124. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  125. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  126. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  127. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  128. package/types/models/modeling_utils.d.ts +44 -35
  129. package/types/models/modeling_utils.d.ts.map +1 -1
  130. package/types/models/models.d.ts +14 -1
  131. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  132. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  133. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  134. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  135. package/types/models/processors.d.ts +4 -0
  136. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
  137. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
  138. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  139. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +43 -6
  140. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  141. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  142. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  143. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
  144. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
  145. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
  146. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
  147. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
  148. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
  149. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
  150. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
  151. package/types/models/registry.d.ts.map +1 -1
  152. package/types/models/sam/image_processing_sam.d.ts +1 -1
  153. package/types/models/session.d.ts +3 -2
  154. package/types/models/session.d.ts.map +1 -1
  155. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  156. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  157. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  158. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  159. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  160. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  161. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  162. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  163. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  164. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  165. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  166. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  167. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  168. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  169. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  170. package/types/pipelines.d.ts +1 -0
  171. package/types/pipelines.d.ts.map +1 -1
  172. package/types/transformers.d.ts +1 -0
  173. package/types/transformers.d.ts.map +1 -1
  174. package/types/utils/audio.d.ts +5 -2
  175. package/types/utils/audio.d.ts.map +1 -1
  176. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  177. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  178. package/types/utils/cache.d.ts.map +1 -1
  179. package/types/utils/dtypes.d.ts +1 -1
  180. package/types/utils/hub.d.ts.map +1 -1
  181. package/types/utils/image.d.ts +1 -1
  182. package/types/utils/lru_cache.d.ts +38 -0
  183. package/types/utils/lru_cache.d.ts.map +1 -0
  184. package/types/utils/memoize_promise.d.ts +14 -0
  185. package/types/utils/memoize_promise.d.ts.map +1 -0
  186. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
  187. package/types/utils/model_registry/get_model_files.d.ts +1 -0
  188. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  189. package/types/utils/tensor.d.ts.map +1 -1
  190. package/src/utils/data-structures.js +0 -572
  191. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  192. package/types/utils/data-structures.d.ts +0 -294
  193. package/types/utils/data-structures.d.ts.map +0 -1
  194. package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -117,6 +117,9 @@ __export(transformers_exports, {
117
117
  BloomModel: () => BloomModel,
118
118
  BloomPreTrainedModel: () => BloomPreTrainedModel,
119
119
  BloomTokenizer: () => BloomTokenizer,
120
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
121
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
122
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
120
123
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
121
124
  CLIPImageProcessor: () => CLIPImageProcessor,
122
125
  CLIPModel: () => CLIPModel,
@@ -212,6 +215,9 @@ __export(transformers_exports, {
212
215
  DebertaV2Tokenizer: () => DebertaV2Tokenizer,
213
216
  DecisionTransformerModel: () => DecisionTransformerModel,
214
217
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
218
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
219
+ DeepseekV3Model: () => DeepseekV3Model,
220
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
215
221
  DeiTFeatureExtractor: () => DeiTFeatureExtractor,
216
222
  DeiTForImageClassification: () => DeiTForImageClassification,
217
223
  DeiTImageProcessor: () => DeiTImageProcessor,
@@ -248,6 +254,7 @@ __export(transformers_exports, {
248
254
  DonutImageProcessor: () => DonutImageProcessor,
249
255
  DonutSwinModel: () => DonutSwinModel,
250
256
  DonutSwinPreTrainedModel: () => DonutSwinPreTrainedModel,
257
+ DynamicCache: () => DynamicCache,
251
258
  EdgeTamModel: () => EdgeTamModel,
252
259
  EfficientNetForImageClassification: () => EfficientNetForImageClassification,
253
260
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
@@ -271,6 +278,11 @@ __export(transformers_exports, {
271
278
  EsmModel: () => EsmModel,
272
279
  EsmPreTrainedModel: () => EsmPreTrainedModel,
273
280
  EsmTokenizer: () => EsmTokenizer,
281
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
282
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
283
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
284
+ EuroBertModel: () => EuroBertModel,
285
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
274
286
  ExaoneForCausalLM: () => ExaoneForCausalLM,
275
287
  ExaoneModel: () => ExaoneModel,
276
288
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -320,6 +332,7 @@ __export(transformers_exports, {
320
332
  Gemma3Model: () => Gemma3Model,
321
333
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
322
334
  Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
335
+ Gemma3nForCausalLM: () => Gemma3nForCausalLM,
323
336
  Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
324
337
  Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
325
338
  Gemma3nProcessor: () => Gemma3nProcessor,
@@ -327,8 +340,14 @@ __export(transformers_exports, {
327
340
  GemmaModel: () => GemmaModel,
328
341
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
329
342
  GemmaTokenizer: () => GemmaTokenizer,
343
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
344
+ Glm46VProcessor: () => Glm46VProcessor,
330
345
  GlmForCausalLM: () => GlmForCausalLM,
331
346
  GlmModel: () => GlmModel,
347
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
348
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
349
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
350
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
332
351
  GlmPreTrainedModel: () => GlmPreTrainedModel,
333
352
  GptOssForCausalLM: () => GptOssForCausalLM,
334
353
  GptOssModel: () => GptOssModel,
@@ -339,6 +358,9 @@ __export(transformers_exports, {
339
358
  GraniteMoeHybridModel: () => GraniteMoeHybridModel,
340
359
  GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
341
360
  GranitePreTrainedModel: () => GranitePreTrainedModel,
361
+ GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
362
+ GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
363
+ GraniteSpeechProcessor: () => GraniteSpeechProcessor,
342
364
  GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
343
365
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
344
366
  GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
@@ -364,7 +386,6 @@ __export(transformers_exports, {
364
386
  IJepaPreTrainedModel: () => IJepaPreTrainedModel,
365
387
  Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
366
388
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
367
- Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
368
389
  Idefics3Processor: () => Idefics3Processor,
369
390
  ImageClassificationPipeline: () => ImageClassificationPipeline,
370
391
  ImageFeatureExtractionPipeline: () => ImageFeatureExtractionPipeline,
@@ -389,6 +410,10 @@ __export(transformers_exports, {
389
410
  Lfm2MoeModel: () => Lfm2MoeModel,
390
411
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
391
412
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
413
+ Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
414
+ Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
415
+ Lfm2VlProcessor: () => Lfm2VlProcessor,
416
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
392
417
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
393
418
  Llama4ForCausalLM: () => Llama4ForCausalLM,
394
419
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -458,6 +483,9 @@ __export(transformers_exports, {
458
483
  MimiPreTrainedModel: () => MimiPreTrainedModel,
459
484
  MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
460
485
  MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
486
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
487
+ Mistral4Model: () => Mistral4Model,
488
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
461
489
  MistralForCausalLM: () => MistralForCausalLM,
462
490
  MistralModel: () => MistralModel,
463
491
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -529,6 +557,9 @@ __export(transformers_exports, {
529
557
  NanoChatForCausalLM: () => NanoChatForCausalLM,
530
558
  NanoChatModel: () => NanoChatModel,
531
559
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
560
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
561
+ NemotronHModel: () => NemotronHModel,
562
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
532
563
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
533
564
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
534
565
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -572,7 +603,6 @@ __export(transformers_exports, {
572
603
  Owlv2Model: () => Owlv2Model,
573
604
  Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
574
605
  PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
575
- PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
576
606
  PaliGemmaProcessor: () => PaliGemmaProcessor,
577
607
  ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
578
608
  ParakeetForCTC: () => ParakeetForCTC,
@@ -616,10 +646,12 @@ __export(transformers_exports, {
616
646
  Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
617
647
  Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
618
648
  Qwen2Tokenizer: () => Qwen2Tokenizer,
649
+ Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
619
650
  Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
620
651
  Qwen2VLImageProcessor: () => Qwen2VLImageProcessor,
621
652
  Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
622
653
  Qwen2VLProcessor: () => Qwen2VLProcessor,
654
+ Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
623
655
  Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
624
656
  Qwen2_5_VLProcessor: () => Qwen2_5_VLProcessor,
625
657
  Qwen3ForCausalLM: () => Qwen3ForCausalLM,
@@ -631,10 +663,14 @@ __export(transformers_exports, {
631
663
  Qwen3NextModel: () => Qwen3NextModel,
632
664
  Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
633
665
  Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
666
+ Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
634
667
  Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
668
+ Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
635
669
  Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
636
670
  Qwen3VLProcessor: () => Qwen3VLProcessor,
671
+ Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
637
672
  Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
673
+ Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
638
674
  Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
639
675
  RFDetrForObjectDetection: () => RFDetrForObjectDetection,
640
676
  RFDetrModel: () => RFDetrModel,
@@ -706,7 +742,6 @@ __export(transformers_exports, {
706
742
  SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
707
743
  SmolLM3Model: () => SmolLM3Model,
708
744
  SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
709
- SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
710
745
  SmolVLMImageProcessor: () => Idefics3ImageProcessor,
711
746
  SmolVLMProcessor: () => Idefics3Processor,
712
747
  SnacDecoderModel: () => SnacDecoderModel,
@@ -714,6 +749,9 @@ __export(transformers_exports, {
714
749
  SnacFeatureExtractor: () => SnacFeatureExtractor,
715
750
  SnacModel: () => SnacModel,
716
751
  SnacPreTrainedModel: () => SnacPreTrainedModel,
752
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
753
+ SolarOpenModel: () => SolarOpenModel,
754
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
717
755
  SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
718
756
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
719
757
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
@@ -812,6 +850,10 @@ __export(transformers_exports, {
812
850
  VitsTokenizer: () => VitsTokenizer,
813
851
  VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
814
852
  VoxtralProcessor: () => VoxtralProcessor,
853
+ VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
854
+ VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
855
+ VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
856
+ VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
815
857
  Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
816
858
  Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
817
859
  Wav2Vec2BertModel: () => Wav2Vec2BertModel,
@@ -910,7 +952,7 @@ var import_node_fs = __toESM(require("fs"), 1);
910
952
  var import_node_path = __toESM(require("path"), 1);
911
953
  var import_node_url = __toESM(require("url"), 1);
912
954
  var import_meta = {};
913
- var VERSION = "4.0.0-next.6";
955
+ var VERSION = "4.0.0-next.8";
914
956
  var HAS_SELF = typeof self !== "undefined";
915
957
  var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
916
958
  var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
@@ -1038,6 +1080,7 @@ var env = {
1038
1080
  customCache: null,
1039
1081
  useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
1040
1082
  cacheKey: "transformers-cache",
1083
+ experimental_useCrossOriginStorage: false,
1041
1084
  /////////////////// Custom fetch /////////////////////
1042
1085
  fetch: DEFAULT_FETCH
1043
1086
  //////////////////////////////////////////////////////
@@ -1139,7 +1182,7 @@ var logger = {
1139
1182
  }
1140
1183
  };
1141
1184
 
1142
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
1185
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
1143
1186
  var DictionarySplitter = class {
1144
1187
  /**
1145
1188
  * @param dictionary The dictionary of words to use for splitting.
@@ -2795,10 +2838,10 @@ var BPE = class extends TokenizerModel_default {
2795
2838
  );
2796
2839
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
2797
2840
  output_tokens.push(...byte_tokens);
2798
- } else {
2841
+ } else if (this.unk_token != null) {
2799
2842
  output_tokens.push(this.unk_token);
2800
2843
  }
2801
- } else {
2844
+ } else if (this.unk_token != null) {
2802
2845
  output_tokens.push(this.unk_token);
2803
2846
  }
2804
2847
  }
@@ -3588,7 +3631,7 @@ var Tokenizer = class {
3588
3631
  };
3589
3632
  var Tokenizer_default = Tokenizer;
3590
3633
 
3591
- // ../../node_modules/.pnpm/@huggingface+jinja@0.5.5/node_modules/@huggingface/jinja/dist/index.js
3634
+ // ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
3592
3635
  var TOKEN_TYPES = Object.freeze({
3593
3636
  Text: "Text",
3594
3637
  // The text between Jinja statements or expressions
@@ -5107,7 +5150,11 @@ var Environment = class {
5107
5150
  ["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
5108
5151
  ["integer", (operand) => operand instanceof IntegerValue],
5109
5152
  ["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
5110
- ["mapping", (operand) => operand.type === "ObjectValue"],
5153
+ ["mapping", (operand) => operand instanceof ObjectValue],
5154
+ [
5155
+ "sequence",
5156
+ (operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
5157
+ ],
5111
5158
  [
5112
5159
  "lower",
5113
5160
  (operand) => {
@@ -5380,6 +5427,9 @@ var Interpreter = class {
5380
5427
  applyFilter(operand, filterNode, environment) {
5381
5428
  if (filterNode.type === "Identifier") {
5382
5429
  const filter = filterNode;
5430
+ if (filter.value === "safe") {
5431
+ return operand;
5432
+ }
5383
5433
  if (filter.value === "tojson") {
5384
5434
  return new StringValue(toJSON(operand, {}));
5385
5435
  }
@@ -5469,6 +5519,8 @@ var Interpreter = class {
5469
5519
  return new IntegerValue(Math.floor(operand.value));
5470
5520
  case "float":
5471
5521
  return new FloatValue(operand.value);
5522
+ case "string":
5523
+ return new StringValue(operand.toString());
5472
5524
  default:
5473
5525
  throw new Error(`Unknown NumericValue filter: ${filter.value}`);
5474
5526
  }
@@ -6897,9 +6949,216 @@ function toAbsoluteURL(url2) {
6897
6949
  return new URL(url2, baseURL).href;
6898
6950
  }
6899
6951
 
6952
+ // src/utils/cache/CrossOriginStorageCache.js
6953
+ var HASH_ALGORITHM = "SHA-256";
6954
+ var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
6955
+ var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
6956
+ var CrossOriginStorage = class {
6957
+ /** @type {Promise<Cache> | null} */
6958
+ #hashCache = null;
6959
+ /**
6960
+ * Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
6961
+ * @returns {Promise<Cache>}
6962
+ */
6963
+ _getHashCache = () => {
6964
+ this.#hashCache ??= caches.open(HASH_CACHE_NAME);
6965
+ return this.#hashCache;
6966
+ };
6967
+ /**
6968
+ * Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
6969
+ * @returns {boolean}
6970
+ */
6971
+ static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
6972
+ /**
6973
+ * Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
6974
+ * the corresponding file handle from cross-origin storage.
6975
+ *
6976
+ * Implements `CacheInterface.match`.
6977
+ *
6978
+ * @param {string} request The URL of the resource to look up.
6979
+ * @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
6980
+ */
6981
+ match = async (request) => {
6982
+ const hashValue = await this._getFileHash(request);
6983
+ if (!hashValue) {
6984
+ return void 0;
6985
+ }
6986
+ try {
6987
+ const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
6988
+ const blob = await handle.getFile();
6989
+ return new Response(blob, {
6990
+ headers: {
6991
+ "Content-Length": String(blob.size)
6992
+ }
6993
+ });
6994
+ } catch {
6995
+ return void 0;
6996
+ }
6997
+ };
6998
+ /**
6999
+ * Stores a response in cross-origin storage, keyed by its SHA-256 hash.
7000
+ *
7001
+ * For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
7002
+ * `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
7003
+ * without reading the response body a second time.
7004
+ *
7005
+ * For non-LFS resources the hash is unknown upfront. In that case the body is consumed
7006
+ * in the background: the stream is read to compute the content hash, the file is written
7007
+ * into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
7008
+ * so that future `match` calls can resolve the file without a network round-trip.
7009
+ *
7010
+ * Implements `CacheInterface.put`.
7011
+ *
7012
+ * @param {string} request The URL of the resource (used as the hash-cache key).
7013
+ * @param {Response} response The response whose body will be written to the cache.
7014
+ * @returns {Promise<void>}
7015
+ */
7016
+ put = async (request, response) => {
7017
+ const hashValue = await this._getFileHash(request);
7018
+ if (hashValue) {
7019
+ const blob = await response.blob();
7020
+ await this._storeBlobInCOS(blob, hashValue);
7021
+ } else {
7022
+ this._processAndStore(request, response.body);
7023
+ }
7024
+ };
7025
+ /**
7026
+ * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
7027
+ *
7028
+ * @param {Blob} blob
7029
+ * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
7030
+ * @returns {Promise<void>}
7031
+ */
7032
+ _storeBlobInCOS = async (blob, hashHex) => {
7033
+ const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
7034
+ create: true
7035
+ });
7036
+ const writableStream = await handle.createWritable();
7037
+ await writableStream.write(blob);
7038
+ await writableStream.close();
7039
+ };
7040
+ /**
7041
+ * Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
7042
+ * of the resulting blob, stores it in cross-origin storage, and persists the computed
7043
+ * hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
7044
+ * file without a network round-trip.
7045
+ *
7046
+ * Called fire-and-forget from `put` — errors are swallowed so failures never surface to
7047
+ * the caller.
7048
+ *
7049
+ * @param {string} request The original resource URL.
7050
+ * @param {ReadableStream} stream The response body stream to consume.
7051
+ * @returns {Promise<void>}
7052
+ */
7053
+ _processAndStore = async (request, stream) => {
7054
+ try {
7055
+ const chunks = [];
7056
+ for await (const chunk2 of stream) {
7057
+ chunks.push(chunk2);
7058
+ }
7059
+ const blob = new Blob(chunks);
7060
+ const hashHex = await this._getBlobHash(blob);
7061
+ await this._storeBlobInCOS(blob, hashHex);
7062
+ try {
7063
+ const hashCache = await this._getHashCache();
7064
+ await hashCache.put(request, new Response(hashHex));
7065
+ } catch {
7066
+ }
7067
+ } catch {
7068
+ }
7069
+ };
7070
+ /**
7071
+ * Deletes the cache entry for the given request.
7072
+ *
7073
+ * Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
7074
+ * expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
7075
+ * permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
7076
+ * re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
7077
+ *
7078
+ * Implements `CacheInterface.delete`.
7079
+ *
7080
+ * @param {string} request
7081
+ * @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
7082
+ */
7083
+ delete = async (request) => {
7084
+ try {
7085
+ const hashCache = await this._getHashCache();
7086
+ return await hashCache.delete(request);
7087
+ } catch {
7088
+ return false;
7089
+ }
7090
+ };
7091
+ /**
7092
+ * Resolves the SHA-256 hash for a given URL.
7093
+ *
7094
+ * Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
7095
+ * Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
7096
+ * LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
7097
+ *
7098
+ * Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry).
7099
+ *
7100
+ * @param {string} url The resource URL to resolve a hash for.
7101
+ * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
7102
+ */
7103
+ _getFileHash = async (url2) => {
7104
+ try {
7105
+ const hashCache = await this._getHashCache();
7106
+ const cached = await hashCache.match(url2);
7107
+ if (cached) {
7108
+ return cached.text();
7109
+ }
7110
+ const hash = await this._getLfsFileHash(url2);
7111
+ if (hash) {
7112
+ await hashCache.put(url2, new Response(hash));
7113
+ return hash;
7114
+ }
7115
+ return null;
7116
+ } catch {
7117
+ return null;
7118
+ }
7119
+ };
7120
+ /**
7121
+ * Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
7122
+ * Git LFS pointer file.
7123
+ *
7124
+ * Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
7125
+ * The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
7126
+ * Returns `null` for non-LFS URLs or when the network request fails.
7127
+ *
7128
+ * @see https://huggingface.co/docs/hub/en/storage-backends#xet
7129
+ * @param {string} url The resolved Hugging Face URL of the resource.
7130
+ * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
7131
+ */
7132
+ _getLfsFileHash = async (url2) => {
7133
+ if (!url2.includes("/resolve/")) {
7134
+ return null;
7135
+ }
7136
+ const rawUrl = url2.replace("/resolve/", "/raw/");
7137
+ try {
7138
+ const text = await fetch(rawUrl).then((r) => r.text());
7139
+ const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
7140
+ return match ? match[1] : null;
7141
+ } catch {
7142
+ return null;
7143
+ }
7144
+ };
7145
+ /**
7146
+ * Computes the SHA-256 hash of a `Blob`'s contents.
7147
+ *
7148
+ * @param {Blob} blob The blob to hash.
7149
+ * @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
7150
+ */
7151
+ _getBlobHash = async (blob) => {
7152
+ const arrayBuffer = await blob.arrayBuffer();
7153
+ const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
7154
+ const hashArray = Array.from(new Uint8Array(hashBuffer));
7155
+ return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
7156
+ };
7157
+ };
7158
+
6900
7159
  // src/utils/cache.js
6901
7160
  async function getCache(file_cache_dir = null) {
6902
- let cache = null;
7161
+ let cache2 = null;
6903
7162
  if (env.useCustomCache) {
6904
7163
  if (!env.customCache) {
6905
7164
  throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
@@ -6909,30 +7168,33 @@ async function getCache(file_cache_dir = null) {
6909
7168
  "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
6910
7169
  );
6911
7170
  }
6912
- cache = env.customCache;
7171
+ cache2 = env.customCache;
6913
7172
  }
6914
- if (!cache && env.useBrowserCache) {
7173
+ if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
7174
+ cache2 = new CrossOriginStorage();
7175
+ }
7176
+ if (!cache2 && env.useBrowserCache) {
6915
7177
  if (typeof caches === "undefined") {
6916
7178
  throw Error("Browser cache is not available in this environment.");
6917
7179
  }
6918
7180
  try {
6919
- cache = await caches.open(env.cacheKey);
7181
+ cache2 = await caches.open(env.cacheKey);
6920
7182
  } catch (e) {
6921
7183
  logger.warn("An error occurred while opening the browser cache:", e);
6922
7184
  }
6923
7185
  }
6924
- if (!cache && env.useFSCache) {
7186
+ if (!cache2 && env.useFSCache) {
6925
7187
  if (!apis.IS_FS_AVAILABLE) {
6926
7188
  throw Error("File System Cache is not available in this environment.");
6927
7189
  }
6928
- cache = new FileCache(file_cache_dir ?? env.cacheDir);
7190
+ cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
6929
7191
  }
6930
- return cache;
7192
+ return cache2;
6931
7193
  }
6932
- async function tryCache(cache, ...names) {
7194
+ async function tryCache(cache2, ...names) {
6933
7195
  for (let name of names) {
6934
7196
  try {
6935
- let result = await cache.match(name);
7197
+ let result = await cache2.match(name);
6936
7198
  if (result) return result;
6937
7199
  } catch (e) {
6938
7200
  continue;
@@ -6941,6 +7203,83 @@ async function tryCache(cache, ...names) {
6941
7203
  return void 0;
6942
7204
  }
6943
7205
 
7206
+ // src/utils/lru_cache.js
7207
/**
 * A least-recently-used cache backed by a `Map`.
 *
 * The `Map`'s insertion order doubles as the recency order: the first key is the
 * least recently used one, the last key the most recently used one.
 */
var LRUCache2 = class {
  /** @type {number} */
  #capacity;
  /** @type {Map<any, any>} */
  #cache;
  /**
   * Creates an LRUCache instance.
   * @param {number} capacity The maximum number of items the cache can hold.
   * A capacity of `0` yields a cache that never retains anything.
   * @throws {RangeError} If `capacity` is not a non-negative number. Without this
   * check a `NaN`/`undefined` capacity would silently disable eviction, because
   * `size > NaN` is always false.
   */
  constructor(capacity) {
    if (typeof capacity !== "number" || Number.isNaN(capacity) || capacity < 0) {
      throw new RangeError(`LRUCache capacity must be a non-negative number. Got ${String(capacity)}`);
    }
    this.#capacity = capacity;
    this.#cache = /* @__PURE__ */ new Map();
  }
  /**
   * Retrieves the value associated with the given key and marks the key as recently used.
   * @param {any} key The key to retrieve.
   * @returns {any} The value associated with the key, or undefined if the key does not exist.
   */
  get(key) {
    if (!this.#cache.has(key)) return void 0;
    const value = this.#cache.get(key);
    // Re-insert so the key moves to the most-recently-used end of the Map.
    this.#cache.delete(key);
    this.#cache.set(key, value);
    return value;
  }
  /**
   * Inserts or updates the key-value pair in the cache.
   * If the key already exists, it is updated and marked as recently used.
   * If the cache exceeds its capacity, the least recently used item is evicted.
   * @param {any} key The key to add or update.
   * @param {any} value The value to associate with the key.
   */
  put(key, value) {
    // Deleting first guarantees the key ends up at the most-recently-used end.
    this.#cache.delete(key);
    this.#cache.set(key, value);
    if (this.#cache.size > this.#capacity) {
      // The first key in iteration order is the least recently used one.
      this.#cache.delete(this.#cache.keys().next().value);
    }
  }
  /**
   * Removes the entry for the given key from the cache.
   * @param {any} key The key to delete.
   * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
   */
  delete(key) {
    return this.#cache.delete(key);
  }
  /**
   * Clears the cache.
   */
  clear() {
    this.#cache.clear();
  }
};
7263
+
7264
+ // src/utils/memoize_promise.js
7265
// Maximum number of promises retained by `memoizePromise` before the least
// recently used one is evicted.
var MAX_CACHE_SIZE = 100;
var cache = new LRUCache2(MAX_CACHE_SIZE);
/**
 * Returns the promise already stored under `key`, or calls `factory` to create one,
 * stores it and returns it. Concurrent and later callers using the same key therefore
 * share a single promise.
 *
 * A rejected promise is not kept: once it rejects, its entry is removed so that a
 * later call can retry. The rejection itself is still propagated to every caller
 * that holds the promise.
 *
 * @template T
 * @param {string} key Identifies the operation being memoized.
 * @param {() => Promise<T>} factory Creates the promise on a cache miss.
 * @returns {Promise<T>} The cached or newly created promise.
 */
function memoizePromise(key, factory) {
  const cached = cache.get(key);
  if (cached !== void 0) {
    return cached;
  }
  const promise = factory().catch((err) => {
    // Only forget the entry if it is still this promise. If it was evicted while
    // pending and another call stored a newer promise under the same key, that
    // newer entry must survive this rejection.
    if (cache.get(key) === promise) {
      cache.delete(key);
    }
    throw err;
  });
  cache.put(key, promise);
  return promise;
}
7282
+
6944
7283
  // src/utils/model_registry/get_file_metadata.js
6945
7284
  async function fetch_file_head(urlOrPath) {
6946
7285
  if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
@@ -6948,17 +7287,27 @@ async function fetch_file_head(urlOrPath) {
6948
7287
  }
6949
7288
  const headers = getFetchHeaders(urlOrPath);
6950
7289
  headers.set("Range", "bytes=0-0");
6951
- return env.fetch(urlOrPath, { method: "GET", headers });
7290
+ return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
7291
+ }
7292
/**
 * Memoized front end for `_get_file_metadata`.
 *
 * Calls that agree on repository/path, filename, `revision`, `cache_dir` and
 * `local_files_only` share one promise through `memoizePromise`. Any other entries
 * in `options` do not take part in the cache key.
 *
 * @param {string} path_or_repo_id Model id or path forwarded to `_get_file_metadata`.
 * @param {string} filename File name forwarded to `_get_file_metadata`.
 * @param {Object} [options] Options forwarded unchanged to `_get_file_metadata`.
 * @returns {Promise<any>} The promise produced by `_get_file_metadata`, possibly shared.
 */
function get_file_metadata(path_or_repo_id, filename, options = {}) {
  const { revision, cache_dir, local_files_only } = options ?? {};
  const key = JSON.stringify([path_or_repo_id, filename, revision, cache_dir, local_files_only]);
  const load = () => _get_file_metadata(path_or_repo_id, filename, options);
  return memoizePromise(key, load);
}
6953
- async function get_file_metadata(path_or_repo_id, filename, options = {}) {
6954
- const cache = await getCache(options?.cache_dir);
7302
+ async function _get_file_metadata(path_or_repo_id, filename, options) {
7303
+ const cache2 = await getCache(options?.cache_dir);
6955
7304
  const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
6956
7305
  path_or_repo_id,
6957
7306
  filename,
6958
7307
  options,
6959
- cache
7308
+ cache2
6960
7309
  );
6961
- const cachedResponse = await checkCachedResource(cache, localPath, proposedCacheKey);
7310
+ const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
6962
7311
  if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
6963
7312
  const size = cachedResponse.headers.get("content-length");
6964
7313
  const contentType = cachedResponse.headers.get("content-type");
@@ -7056,7 +7405,7 @@ function getFetchHeaders(urlOrPath) {
7056
7405
  }
7057
7406
  return headers;
7058
7407
  }
7059
- function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) {
7408
+ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
7060
7409
  const revision = options.revision ?? "main";
7061
7410
  const requestURL = pathJoin(path_or_repo_id, filename);
7062
7411
  const validModelId = isValidHfModelId(path_or_repo_id);
@@ -7066,7 +7415,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
7066
7415
  env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
7067
7416
  filename
7068
7417
  );
7069
- const proposedCacheKey = cache instanceof FileCache ? (
7418
+ const proposedCacheKey = cache2 instanceof FileCache ? (
7070
7419
  // Choose cache key for filesystem cache
7071
7420
  // When using the main revision (default), we use the request URL as the cache key.
7072
7421
  // If a specific revision is requested, we account for this in the cache key.
@@ -7080,14 +7429,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
7080
7429
  validModelId
7081
7430
  };
7082
7431
  }
7083
- async function checkCachedResource(cache, localPath, proposedCacheKey) {
7084
- if (!cache) {
7432
+ async function checkCachedResource(cache2, localPath, proposedCacheKey) {
7433
+ if (!cache2) {
7085
7434
  return void 0;
7086
7435
  }
7087
- return await tryCache(cache, localPath, proposedCacheKey);
7436
+ return await tryCache(cache2, localPath, proposedCacheKey);
7088
7437
  }
7089
- async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options = {}) {
7090
- if (await cache.match(cacheKey) !== void 0) {
7438
+ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
7439
+ if (await cache2.match(cacheKey) !== void 0) {
7091
7440
  return;
7092
7441
  }
7093
7442
  if (!result) {
@@ -7097,20 +7446,22 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
7097
7446
  file: filename,
7098
7447
  ...data
7099
7448
  }) : void 0;
7100
- await cache.put(
7449
+ await cache2.put(
7101
7450
  cacheKey,
7102
7451
  /** @type {Response} */
7103
7452
  response,
7104
7453
  wrapped_progress
7105
7454
  );
7106
7455
  } else if (typeof response !== "string") {
7107
- await cache.put(
7456
+ const headers = new Headers(response.headers);
7457
+ headers.set("content-length", result.byteLength.toString());
7458
+ await cache2.put(
7108
7459
  cacheKey,
7109
7460
  new Response(
7110
7461
  /** @type {any} */
7111
7462
  result,
7112
7463
  {
7113
- headers: response.headers
7464
+ headers
7114
7465
  }
7115
7466
  )
7116
7467
  ).catch((err) => {
@@ -7118,17 +7469,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
7118
7469
  });
7119
7470
  }
7120
7471
  }
7121
- async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache = null) {
7472
+ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
7122
7473
  const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
7123
7474
  path_or_repo_id,
7124
7475
  filename,
7125
7476
  options,
7126
- cache
7477
+ cache2
7127
7478
  );
7128
7479
  let cacheKey;
7129
7480
  let toCacheResponse = false;
7130
7481
  let response;
7131
- response = await checkCachedResource(cache, localPath, proposedCacheKey);
7482
+ response = await checkCachedResource(cache2, localPath, proposedCacheKey);
7132
7483
  const cacheHit = response !== void 0;
7133
7484
  if (!cacheHit) {
7134
7485
  if (env.allowLocalModels) {
@@ -7169,7 +7520,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
7169
7520
  }
7170
7521
  cacheKey = proposedCacheKey;
7171
7522
  }
7172
- toCacheResponse = cache && // 1. A caching system is available
7523
+ toCacheResponse = cache2 && // 1. A caching system is available
7173
7524
  typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
7174
7525
  response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
7175
7526
  response.status === 200;
@@ -7231,7 +7582,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
7231
7582
  // i.e., do not cache FileResponses (prevents duplication)
7232
7583
  toCacheResponse && cacheKey && typeof response !== "string"
7233
7584
  ) {
7234
- await storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options);
7585
+ await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
7235
7586
  }
7236
7587
  dispatchCallback(options.progress_callback, {
7237
7588
  status: "done",
@@ -7247,7 +7598,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
7247
7598
  if (response instanceof FileResponse) {
7248
7599
  return response.filePath;
7249
7600
  }
7250
- const cachedResponse = await cache?.match(cacheKey);
7601
+ const cachedResponse = await cache2?.match(cacheKey);
7251
7602
  if (cachedResponse instanceof FileResponse) {
7252
7603
  return cachedResponse.filePath;
7253
7604
  } else if (cachedResponse instanceof Response) {
@@ -7274,8 +7625,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
7274
7625
  name: path_or_repo_id,
7275
7626
  file: filename
7276
7627
  });
7277
- const cache = await getCache(options?.cache_dir);
7278
- return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache);
7628
+ const cache2 = await getCache(options?.cache_dir);
7629
+ return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
7279
7630
  }
7280
7631
  async function getModelText(modelPath, fileName, fatal = true, options = {}) {
7281
7632
  const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
@@ -8068,7 +8419,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
8068
8419
  // src/backends/onnx.js
8069
8420
  var ONNX_NODE = __toESM(require("onnxruntime-node"), 1);
8070
8421
 
8071
- // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260303-e7e64dc112/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
8422
+ // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
8072
8423
  var ort_webgpu_bundle_min_exports = {};
8073
8424
  __export(ort_webgpu_bundle_min_exports, {
8074
8425
  InferenceSession: () => Jf,
@@ -8837,7 +9188,7 @@ async function ts(a = {}) {
8837
9188
  throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
8838
9189
  }
8839
9190
  function Ye() {
8840
- return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, H: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, g: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, I: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, h: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
9191
+ return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
8841
9192
  }
8842
9193
  async function bt() {
8843
9194
  function e(o, u) {
@@ -10024,7 +10375,7 @@ async function ts(a = {}) {
10024
10375
  Te(`invalid type for getValue: ${t}`);
10025
10376
  }
10026
10377
  }, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
10027
- var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 923180: (e, t, n, o, u) => {
10378
+ var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
10028
10379
  if (r === void 0 || !r.Uc) return 1;
10029
10380
  if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
10030
10381
  if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
@@ -10044,11 +10395,11 @@ async function ts(a = {}) {
10044
10395
  } catch {
10045
10396
  return 4;
10046
10397
  }
10047
- }, 924004: (e, t, n) => {
10398
+ }, 926500: (e, t, n) => {
10048
10399
  r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
10049
- }, 924068: () => r.me(), 924110: (e) => {
10400
+ }, 926564: () => r.me(), 926606: (e) => {
10050
10401
  r.jd(e);
10051
- }, 924147: () => typeof wasmOffsetConverter < "u" };
10402
+ }, 926643: () => typeof wasmOffsetConverter < "u" };
10052
10403
  function af(e, t, n, o) {
10053
10404
  var u = P();
10054
10405
  try {
@@ -11964,7 +12315,7 @@ var $s = k(() => {
11964
12315
  Ve();
11965
12316
  Ve();
11966
12317
  Ve();
11967
- var Xa = "1.25.0-dev.20260303-e7e64dc112";
12318
+ var Xa = "1.25.0-dev.20260307-d626b568e0";
11968
12319
  var Tl = Zr;
11969
12320
  {
11970
12321
  let a = ($s(), $t(Gs)).wasmBackend;
@@ -11975,11 +12326,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
11975
12326
  // src/backends/utils/cacheWasm.js
11976
12327
  async function loadAndCacheFile(url2) {
11977
12328
  const fileName = url2.split("/").pop();
11978
- let cache;
12329
+ let cache2;
11979
12330
  try {
11980
- cache = await getCache();
11981
- if (cache) {
11982
- const result = await cache.match(url2);
12331
+ cache2 = await getCache();
12332
+ if (cache2) {
12333
+ const result = await cache2.match(url2);
11983
12334
  if (result) {
11984
12335
  return result;
11985
12336
  }
@@ -11991,9 +12342,9 @@ async function loadAndCacheFile(url2) {
11991
12342
  if (!response.ok) {
11992
12343
  throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
11993
12344
  }
11994
- if (cache) {
12345
+ if (cache2) {
11995
12346
  try {
11996
- await cache.put(url2, response.clone());
12347
+ await cache2.put(url2, response.clone());
11997
12348
  } catch (e) {
11998
12349
  logger.warn(`Failed to cache ${fileName}:`, e);
11999
12350
  }
@@ -13845,9 +14196,23 @@ var Tensor2 = class _Tensor {
13845
14196
  throw Error(`Unsupported norm: ${p}`);
13846
14197
  }
13847
14198
  const this_data = this.data;
13848
- const fn2 = (a, b) => a + b ** p;
14199
+ const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
14200
+ if (is_bigint && p !== 1) {
14201
+ throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
14202
+ }
14203
+ let fn2, zero;
14204
+ if (is_bigint) {
14205
+ fn2 = (a, b) => a + b;
14206
+ zero = 0n;
14207
+ } else {
14208
+ fn2 = (a, b) => a + b ** p;
14209
+ zero = 0;
14210
+ }
13849
14211
  if (dim === null) {
13850
- const val = this_data.reduce(fn2, 0) ** (1 / p);
14212
+ let val = this_data.reduce(fn2, zero);
14213
+ if (p !== 1) {
14214
+ val = val ** (1 / p);
14215
+ }
13851
14216
  return new _Tensor(this.type, [val], []);
13852
14217
  }
13853
14218
  const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
@@ -16307,9 +16672,12 @@ __export(processors_exports, {
16307
16672
  ChatterboxProcessor: () => ChatterboxProcessor,
16308
16673
  Florence2Processor: () => Florence2Processor,
16309
16674
  Gemma3nProcessor: () => Gemma3nProcessor,
16675
+ Glm46VProcessor: () => Glm46VProcessor,
16676
+ GraniteSpeechProcessor: () => GraniteSpeechProcessor,
16310
16677
  GroundingDinoProcessor: () => GroundingDinoProcessor,
16311
16678
  Idefics3Processor: () => Idefics3Processor,
16312
16679
  JinaCLIPProcessor: () => JinaCLIPProcessor,
16680
+ Lfm2VlProcessor: () => Lfm2VlProcessor,
16313
16681
  LlavaProcessor: () => LlavaProcessor,
16314
16682
  MgpstrProcessor: () => MgpstrProcessor,
16315
16683
  MoonshineProcessor: () => MoonshineProcessor,
@@ -16330,6 +16698,7 @@ __export(processors_exports, {
16330
16698
  UltravoxProcessor: () => UltravoxProcessor,
16331
16699
  VLChatProcessor: () => VLChatProcessor,
16332
16700
  VoxtralProcessor: () => VoxtralProcessor,
16701
+ VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
16333
16702
  Wav2Vec2Processor: () => Wav2Vec2Processor,
16334
16703
  Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
16335
16704
  WhisperProcessor: () => WhisperProcessor
@@ -16384,12 +16753,14 @@ __export(feature_extractors_exports, {
16384
16753
  EncodecFeatureExtractor: () => EncodecFeatureExtractor,
16385
16754
  FeatureExtractor: () => FeatureExtractor,
16386
16755
  Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
16756
+ GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
16387
16757
  MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
16388
16758
  ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
16389
16759
  PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
16390
16760
  SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
16391
16761
  SnacFeatureExtractor: () => SnacFeatureExtractor,
16392
16762
  SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
16763
+ VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
16393
16764
  Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
16394
16765
  WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
16395
16766
  WhisperFeatureExtractor: () => WhisperFeatureExtractor
@@ -16617,6 +16988,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
16617
16988
  mel_filters = null,
16618
16989
  mel_floor = 1e-10,
16619
16990
  log_mel = null,
16991
+ max_log_mel = null,
16620
16992
  reference = 1,
16621
16993
  min_value = 1e-10,
16622
16994
  db_range = null,
@@ -16756,6 +17128,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
16756
17128
  mel_spec_data[i] = Math.log10(mel_spec_data[i]);
16757
17129
  }
16758
17130
  break;
17131
+ case "log10_max_norm": {
17132
+ for (let i = 0; i < o; ++i) {
17133
+ mel_spec_data[i] = Math.log10(mel_spec_data[i]);
17134
+ }
17135
+ const logMax = max_log_mel ?? max(mel_spec_data)[0];
17136
+ const threshold = logMax - 8;
17137
+ for (let i = 0; i < o; ++i) {
17138
+ mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
17139
+ }
17140
+ break;
17141
+ }
16759
17142
  case "dB":
16760
17143
  if (power === 1) {
16761
17144
  amplitude_to_db(mel_spec_data, reference, min_value, db_range);
@@ -16766,7 +17149,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
16766
17149
  }
16767
17150
  break;
16768
17151
  default:
16769
- throw new Error(`log_mel must be one of null, 'log', 'log10' or 'dB'. Got '${log_mel}'`);
17152
+ throw new Error(
17153
+ `log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
17154
+ );
16770
17155
  }
16771
17156
  }
16772
17157
  return mel_spec;
@@ -17271,6 +17656,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
17271
17656
  }
17272
17657
  };
17273
17658
 
17659
+ // src/models/granite_speech/feature_extraction_granite_speech.js
17660
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
  /**
   * Precomputes the mel filter bank and the analysis window from `config.melspec_kwargs`.
   * @param {Object} config Feature extractor configuration. Must provide
   * `melspec_kwargs` with `n_fft`, `win_length`, `n_mels` and `sample_rate`.
   */
  constructor(config) {
    super(config);
    const {
      n_fft: fft_size,
      win_length: window_length,
      n_mels: num_mel_bins,
      sample_rate: sampling_rate
    } = config.melspec_kwargs;
    const num_frequency_bins = Math.floor(1 + fft_size / 2);
    const max_frequency = sampling_rate / 2;
    // No filter normalization and the "htk" mel scale (the torchaudio defaults).
    this.mel_filters = mel_filter_bank(
      num_frequency_bins,
      num_mel_bins,
      0,
      max_frequency,
      sampling_rate,
      null,
      "htk"
    );
    // The Hann window spans `win_length` samples; it is placed in the middle of a
    // zero-filled buffer of `n_fft` samples.
    const hann = window_function(window_length, "hann");
    const left_offset = Math.floor((fft_size - window_length) / 2);
    const padded_window = new Float64Array(fft_size);
    this.window = padded_window;
    padded_window.set(hann, left_offset);
  }
  /**
   * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @returns {Promise<{input_features: Tensor}>}
   */
  async _call(audio) {
    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
    const {
      n_fft: fft_size,
      hop_length: hop,
      n_mels: num_mel_bins
    } = this.config.melspec_kwargs;
    const frame_count = 1 + Math.floor((audio.length - 1) / hop);
    // Round down to an even frame count so the frames can be regrouped two at a time below.
    const even_frame_count = frame_count - (frame_count % 2);
    const spectrogram_options = {
      power: 2,
      mel_filters: this.mel_filters,
      log_mel: "log10_max_norm",
      // Transposed output: [time, n_mels]
      transpose: true,
      max_num_frames: even_frame_count,
      do_pad: false
    };
    const mel = await spectrogram(audio, this.window, fft_size, hop, spectrogram_options);
    // Each row of the view holds 2 * n_mels values; a leading dimension of size 1 is then added in place.
    const stacked = mel.view(-1, 2 * num_mel_bins);
    return { input_features: stacked.unsqueeze_(0) };
  }
};
+
17274
17709
  // src/models/moonshine/feature_extraction_moonshine.js
17275
17710
  var MoonshineFeatureExtractor = class extends FeatureExtractor {
17276
17711
  /**
@@ -17751,6 +18186,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
17751
18186
  }
17752
18187
  };
17753
18188
 
18189
+ // src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
18190
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
  /**
   * Ensures `config.mel_filters` exists (building a filter bank when the config does
   * not supply one) and prepares a Hann window of `n_fft` samples.
   * @param {Object} config Feature extractor configuration.
   */
  constructor(config) {
    super(config);
    if (this.config.mel_filters == null) {
      const num_frequency_bins = Math.floor(1 + this.config.n_fft / 2);
      const num_mel_filters = this.config.feature_size;
      const min_frequency = 0;
      const max_frequency = 8e3;
      this.config.mel_filters = mel_filter_bank(
        num_frequency_bins,
        num_mel_filters,
        min_frequency,
        max_frequency,
        this.config.sampling_rate,
        // norm
        "slaney",
        // mel_scale
        "slaney"
      );
    }
    this.window = window_function(this.config.n_fft, "hann");
  }
  /**
   * Computes the log-Mel spectrogram of the provided audio waveform.
   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
   */
  async _extract_fbank_features(waveform, { center = true } = {}) {
    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
    // Without centering, the length of one FFT frame is subtracted before counting hops.
    let max_num_frames;
    if (center) {
      max_num_frames = Math.floor(waveform.length / hop_length);
    } else {
      max_num_frames = Math.floor((waveform.length - n_fft) / hop_length);
    }
    const spectrogram_options = {
      power: 2,
      mel_filters,
      log_mel: "log10_max_norm",
      // Passed as the maximum for the log-mel normalization; when it is nullish,
      // `spectrogram` falls back to the maximum of the computed data.
      max_log_mel: global_log_mel_max,
      center,
      max_num_frames,
      do_pad: false
    };
    const features = await spectrogram(waveform, this.window, n_fft, hop_length, spectrogram_options);
    return features;
  }
  /**
   * Extract mel spectrogram features from audio.
   * @param {Float32Array|Float64Array} audio The audio data.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
   */
  async _call(audio, { center = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
    const features = await this._extract_fbank_features(audio, { center });
    // Add a leading dimension of size 1 in place.
    const input_features = features.unsqueeze_(0);
    return { input_features };
  }
};
18253
+
17754
18254
  // src/models/whisper/feature_extraction_whisper.js
17755
18255
  var WhisperFeatureExtractor = class extends FeatureExtractor {
17756
18256
  constructor(config) {
@@ -17779,7 +18279,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
17779
18279
  * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
17780
18280
  */
17781
18281
  async _extract_fbank_features(waveform) {
17782
- const features = await spectrogram(
18282
+ return await spectrogram(
17783
18283
  waveform,
17784
18284
  this.window,
17785
18285
  // window
@@ -17790,7 +18290,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
17790
18290
  {
17791
18291
  power: 2,
17792
18292
  mel_filters: this.config.mel_filters,
17793
- log_mel: "log10",
18293
+ log_mel: "log10_max_norm",
17794
18294
  // Custom
17795
18295
  max_num_frames: Math.min(
17796
18296
  Math.floor(waveform.length / this.config.hop_length),
@@ -17799,15 +18299,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
17799
18299
  )
17800
18300
  }
17801
18301
  );
17802
- const data = features.data;
17803
- const maxValue = max(
17804
- /** @type {Float32Array} */
17805
- data
17806
- )[0];
17807
- for (let i = 0; i < data.length; ++i) {
17808
- data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
17809
- }
17810
- return features;
17811
18302
  }
17812
18303
  /**
17813
18304
  * Asynchronously extracts features from a given audio using the provided configuration.
@@ -18686,6 +19177,30 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
18686
19177
  }
18687
19178
  return [segmentation, segments];
18688
19179
  }
19180
/**
 * Chooses output dimensions for an image such that both sides are multiples of
 * `factor` and `temporal_factor * height * width` is steered towards the range
 * `[min_pixels, max_pixels]`, while roughly preserving the aspect ratio.
 *
 * Steps:
 *  1. If either side is smaller than `factor`, both sides are scaled up by the same
 *     ratio so that the smaller one reaches `factor`.
 *  2. Aspect ratios above 200 are rejected.
 *  3. Both sides are rounded to the nearest multiple of `factor`.
 *  4. If the result is too large, both sides are shrunk (rounding down, but never
 *     below `factor`); if it is too small, both sides are enlarged (rounding up).
 *
 * @param {number} height Input height in pixels; must be a positive finite number.
 * @param {number} width Input width in pixels; must be a positive finite number.
 * @param {number} [factor=28] Both returned sides are multiples of this value.
 * @param {number} [min_pixels=3136] Lower bound for `temporal_factor * height * width`.
 * @param {number} [max_pixels=1003520] Upper bound for `temporal_factor * height * width`.
 * @param {number} [temporal_factor=1] Multiplier applied to the pixel count before it
 * is compared with the bounds.
 * @returns {[number, number]} The new size as `[width, height]` (note the order).
 * @throws {Error} If `height` or `width` is not a positive finite number, or if the
 * aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
  // Zero, negative, NaN or infinite sides would otherwise propagate silently as
  // NaN/Infinity through every computation below.
  if (!Number.isFinite(height) || !Number.isFinite(width) || height <= 0 || width <= 0) {
    throw new Error(`height and width must be positive finite numbers, got height=${height}, width=${width}`);
  }
  let h = height;
  let w = width;
  if (h < factor || w < factor) {
    const scale = Math.max(factor / h, factor / w);
    h = Math.round(h * scale);
    w = Math.round(w * scale);
  }
  const aspect_ratio = Math.max(h, w) / Math.min(h, w);
  if (aspect_ratio > 200) {
    throw new Error(
      `absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`
    );
  }
  let h_bar = Math.round(h / factor) * factor;
  let w_bar = Math.round(w / factor) * factor;
  const pixels = temporal_factor * h_bar * w_bar;
  if (pixels > max_pixels) {
    // Shrink both sides by the same ratio, rounding down to a multiple of `factor`.
    const beta = Math.sqrt(temporal_factor * h * w / max_pixels);
    h_bar = Math.max(factor, Math.floor(h / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(w / beta / factor) * factor);
  } else if (pixels < min_pixels) {
    // Enlarge both sides by the same ratio, rounding up to a multiple of `factor`.
    const beta = Math.sqrt(min_pixels / (temporal_factor * h * w));
    h_bar = Math.ceil(h * beta / factor) * factor;
    w_bar = Math.ceil(w * beta / factor) * factor;
  }
  return [w_bar, h_bar];
}
18689
19204
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
18690
19205
  if (label_ids_to_fuse === null) {
18691
19206
  logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
@@ -18763,7 +19278,7 @@ var ImageProcessor = class extends Callable2 {
18763
19278
  this.do_pad = config.do_pad;
18764
19279
  this.min_pixels = config.min_pixels;
18765
19280
  this.max_pixels = config.max_pixels;
18766
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
19281
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
18767
19282
  this.pad_size = this.size;
18768
19283
  }
18769
19284
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -18974,7 +19489,7 @@ var ImageProcessor = class extends Callable2 {
18974
19489
  });
18975
19490
  }
18976
19491
  /**
18977
- * @typedef {object} PreprocessedImage
19492
+ * @typedef {Object} PreprocessedImage
18978
19493
  * @property {HeightWidth} original_size The original size of the image.
18979
19494
  * @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
18980
19495
  * @property {Tensor} pixel_values The pixel values of the preprocessed image.
@@ -19051,10 +19566,8 @@ var ImageProcessor = class extends Callable2 {
19051
19566
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
19052
19567
  [pixelData, imgDims] = padded;
19053
19568
  } else if (this.size_divisibility) {
19054
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
19055
- [imgDims[1], imgDims[0]],
19056
- this.size_divisibility
19057
- );
19569
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
19570
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
19058
19571
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
19059
19572
  }
19060
19573
  }
@@ -19131,6 +19644,7 @@ var image_processors_exports = {};
19131
19644
  __export(image_processors_exports, {
19132
19645
  BeitFeatureExtractor: () => BeitFeatureExtractor,
19133
19646
  BitImageProcessor: () => BitImageProcessor,
19647
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
19134
19648
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
19135
19649
  CLIPImageProcessor: () => CLIPImageProcessor,
19136
19650
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -19147,11 +19661,13 @@ __export(image_processors_exports, {
19147
19661
  DonutImageProcessor: () => DonutImageProcessor,
19148
19662
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
19149
19663
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
19664
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
19150
19665
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
19151
19666
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
19152
19667
  ImageFeatureExtractor: () => ImageProcessor,
19153
19668
  ImageProcessor: () => ImageProcessor,
19154
19669
  JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
19670
+ Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
19155
19671
  LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
19156
19672
  Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
19157
19673
  MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
@@ -19206,6 +19722,10 @@ var BitImageProcessor = class extends ImageProcessor {
19206
19722
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
19207
19723
  };
19208
19724
 
19725
+ // src/models/chmv2/image_processing_chmv2.js
19726
+ var CHMv2ImageProcessor = class extends ImageProcessor {
19727
+ };
19728
+
19209
19729
  // src/models/clip/image_processing_clip.js
19210
19730
  var CLIPImageProcessor = class extends ImageProcessor {
19211
19731
  };
@@ -19325,6 +19845,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
19325
19845
  }
19326
19846
  };
19327
19847
 
19848
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
19849
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
19850
+ constructor(config) {
19851
+ super(config);
19852
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
19853
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
19854
+ this.patch_size = config.patch_size;
19855
+ this.merge_size = config.merge_size;
19856
+ }
19857
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
19858
+ get_resize_output_image_size(image, size) {
19859
+ const factor = this.patch_size * this.merge_size;
19860
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
19861
+ }
19862
+ async _call(images, ...args) {
19863
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
19864
+ let patches = pixel_values;
19865
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
19866
+ if (patches.dims[0] === 1) {
19867
+ patches = cat(
19868
+ Array.from({ length: temporal_patch_size }, () => patches),
19869
+ 0
19870
+ );
19871
+ }
19872
+ const grid_t = patches.dims[0] / temporal_patch_size;
19873
+ const channel = patches.dims[1];
19874
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
19875
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
19876
+ const flatten_patches = patches.view(
19877
+ grid_t,
19878
+ temporal_patch_size,
19879
+ channel,
19880
+ Math.floor(grid_h / merge_size),
19881
+ merge_size,
19882
+ patch_size,
19883
+ Math.floor(grid_w / merge_size),
19884
+ merge_size,
19885
+ patch_size
19886
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
19887
+ const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
19888
+ return {
19889
+ pixel_values: flatten_patches,
19890
+ image_grid_thw,
19891
+ original_sizes,
19892
+ reshaped_input_sizes
19893
+ };
19894
+ }
19895
+ };
19896
+
19897
+ // src/models/glm46v/image_processing_glm46v.js
19898
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
19899
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
19900
+ get_resize_output_image_size(image, size) {
19901
+ const factor = this.patch_size * this.merge_size;
19902
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
19903
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
19904
+ }
19905
+ };
19906
+
19328
19907
  // src/models/glpn/image_processing_glpn.js
19329
19908
  var GLPNFeatureExtractor = class extends ImageProcessor {
19330
19909
  };
@@ -19555,6 +20134,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
19555
20134
  }
19556
20135
  };
19557
20136
 
20137
+ // src/models/lfm2_vl/image_processing_lfm2_vl.js
20138
+ function round_by_factor(number, factor) {
20139
+ return Math.round(number / factor) * factor;
20140
+ }
20141
+ function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
20142
+ let best_ratio_diff = Infinity;
20143
+ let best_ratio = [1, 1];
20144
+ const area = width * height;
20145
+ for (const ratio of target_ratios) {
20146
+ const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
20147
+ if (ratio_diff < best_ratio_diff) {
20148
+ best_ratio_diff = ratio_diff;
20149
+ best_ratio = ratio;
20150
+ } else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
20151
+ best_ratio = ratio;
20152
+ }
20153
+ }
20154
+ return best_ratio;
20155
+ }
20156
+ function get_target_ratios(min_tiles, max_tiles) {
20157
+ const ratios = [];
20158
+ const seen = /* @__PURE__ */ new Set();
20159
+ for (let n = min_tiles; n <= max_tiles; ++n) {
20160
+ for (let w = 1; w <= n; ++w) {
20161
+ for (let h = 1; h <= n; ++h) {
20162
+ const product2 = w * h;
20163
+ if (product2 >= min_tiles && product2 <= max_tiles) {
20164
+ const key = w << 16 | h;
20165
+ if (!seen.has(key)) {
20166
+ seen.add(key);
20167
+ ratios.push([w, h]);
20168
+ }
20169
+ }
20170
+ }
20171
+ }
20172
+ }
20173
+ return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
20174
+ }
20175
+ function convert_image_to_patches(images, patch_size) {
20176
+ const [B, C, H, W] = images.dims;
20177
+ const ph = Math.floor(H / patch_size), pw = Math.floor(W / patch_size);
20178
+ const patch_dim = patch_size * patch_size * C;
20179
+ const data = (
20180
+ /** @type {Float32Array} */
20181
+ images.data
20182
+ );
20183
+ const result = new Float32Array(B * ph * pw * patch_dim);
20184
+ const ch_stride = H * W;
20185
+ for (let b = 0; b < B; ++b) {
20186
+ const b_src = b * C * ch_stride;
20187
+ const b_dst = b * ph * pw * patch_dim;
20188
+ for (let py = 0; py < ph; ++py) {
20189
+ for (let px = 0; px < pw; ++px) {
20190
+ let off = b_dst + (py * pw + px) * patch_dim;
20191
+ for (let dy = 0; dy < patch_size; ++dy) {
20192
+ const row = (py * patch_size + dy) * W + px * patch_size;
20193
+ for (let dx = 0; dx < patch_size; ++dx) {
20194
+ const pixel = row + dx;
20195
+ for (let c = 0; c < C; ++c) {
20196
+ result[off++] = data[b_src + c * ch_stride + pixel];
20197
+ }
20198
+ }
20199
+ }
20200
+ }
20201
+ }
20202
+ }
20203
+ return new Tensor2("float32", result, [B, ph * pw, patch_dim]);
20204
+ }
20205
+ function pad_along_first_dim(patches, target_length) {
20206
+ const [, len2, dim] = patches.dims;
20207
+ const mask_data = new BigInt64Array(target_length);
20208
+ mask_data.fill(1n, 0, len2);
20209
+ let padded = patches;
20210
+ if (len2 < target_length) {
20211
+ const padded_data = new Float32Array(target_length * dim);
20212
+ padded_data.set(
20213
+ /** @type {Float32Array} */
20214
+ patches.data
20215
+ );
20216
+ padded = new Tensor2("float32", padded_data, [1, target_length, dim]);
20217
+ }
20218
+ return { padded, mask: new Tensor2("int64", mask_data, [target_length]) };
20219
+ }
20220
+ var Lfm2VlImageProcessor = class extends ImageProcessor {
20221
+ constructor(config) {
20222
+ super(config);
20223
+ this.downsample_factor = config.downsample_factor ?? 2;
20224
+ this.do_image_splitting = config.do_image_splitting ?? true;
20225
+ this.min_tiles = config.min_tiles ?? 2;
20226
+ this.max_tiles = config.max_tiles ?? 10;
20227
+ this.use_thumbnail = config.use_thumbnail ?? true;
20228
+ this.min_image_tokens = config.min_image_tokens ?? 64;
20229
+ this.max_image_tokens = config.max_image_tokens ?? 256;
20230
+ this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
20231
+ this.tile_size = config.tile_size ?? 512;
20232
+ this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
20233
+ this.return_row_col_info = config.return_row_col_info ?? false;
20234
+ const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
20235
+ const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
20236
+ this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
20237
+ }
20238
+ /**
20239
+ * Check if the image is too large to be processed as a single tile.
20240
+ * @param {number} height
20241
+ * @param {number} width
20242
+ * @returns {boolean}
20243
+ */
20244
+ _is_image_too_large(height, width) {
20245
+ const total_factor = this.encoder_patch_size * this.downsample_factor;
20246
+ const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
20247
+ const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
20248
+ return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
20249
+ }
20250
+ /**
20251
+ * Get the grid layout for tiling a large image.
20252
+ * @param {number} height
20253
+ * @param {number} width
20254
+ * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
20255
+ */
20256
+ _get_grid_layout(height, width) {
20257
+ const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
20258
+ const [grid_width, grid_height] = find_closest_aspect_ratio(
20259
+ width / height,
20260
+ target_ratios,
20261
+ width,
20262
+ height,
20263
+ this.tile_size
20264
+ );
20265
+ return {
20266
+ grid_width,
20267
+ grid_height,
20268
+ target_width: this.tile_size * grid_width,
20269
+ target_height: this.tile_size * grid_height
20270
+ };
20271
+ }
20272
+ /** @param {RawImage|RawImage[]|RawImage[][]} images */
20273
+ // @ts-expect-error
20274
+ async _call(images, { return_row_col_info = null } = {}) {
20275
+ let batched_images;
20276
+ if (!Array.isArray(images)) {
20277
+ batched_images = [[images]];
20278
+ } else if (!Array.isArray(images[0])) {
20279
+ batched_images = [
20280
+ /** @type {RawImage[]} */
20281
+ images
20282
+ ];
20283
+ } else {
20284
+ batched_images = /** @type {RawImage[][]} */
20285
+ images;
20286
+ }
20287
+ const all_pixel_values = [];
20288
+ const all_pixel_masks = [];
20289
+ const all_spatial_shapes = [];
20290
+ const all_rows = [];
20291
+ const all_cols = [];
20292
+ const all_image_sizes = [];
20293
+ for (const image_batch of batched_images) {
20294
+ const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
20295
+ for (const { pixel_values } of preprocessed) {
20296
+ const [, height, width] = pixel_values.dims;
20297
+ const img = pixel_values.unsqueeze_(0);
20298
+ const total_factor = this.encoder_patch_size * this.downsample_factor;
20299
+ const f2 = total_factor ** 2;
20300
+ const [new_width, new_height] = smart_resize(
20301
+ Math.max(total_factor, height),
20302
+ Math.max(total_factor, width),
20303
+ total_factor,
20304
+ this.min_image_tokens * f2,
20305
+ this.max_image_tokens * f2
20306
+ ).map((x) => Math.max(total_factor, x));
20307
+ let tiles;
20308
+ let num_rows = 1, num_cols = 1;
20309
+ const is_large = this._is_image_too_large(height, width);
20310
+ const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
20311
+ if (is_large && do_splitting) {
20312
+ const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
20313
+ height,
20314
+ width
20315
+ );
20316
+ num_rows = grid_height;
20317
+ num_cols = grid_width;
20318
+ const resized = await interpolate_4d(img, {
20319
+ size: [target_height, target_width]
20320
+ });
20321
+ tiles = [];
20322
+ for (let r = 0; r < grid_height; ++r) {
20323
+ for (let c = 0; c < grid_width; ++c) {
20324
+ const y = r * this.tile_size;
20325
+ const x = c * this.tile_size;
20326
+ tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
20327
+ }
20328
+ }
20329
+ if (this.use_thumbnail && grid_width * grid_height !== 1) {
20330
+ tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
20331
+ }
20332
+ } else {
20333
+ tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
20334
+ }
20335
+ for (const tile of tiles) {
20336
+ const [, , th, tw] = tile.dims;
20337
+ const patches = convert_image_to_patches(tile, this.encoder_patch_size);
20338
+ const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
20339
+ all_pixel_values.push(padded);
20340
+ all_pixel_masks.push(mask);
20341
+ all_spatial_shapes.push([
20342
+ Math.floor(th / this.encoder_patch_size),
20343
+ Math.floor(tw / this.encoder_patch_size)
20344
+ ]);
20345
+ }
20346
+ all_rows.push(num_rows);
20347
+ all_cols.push(num_cols);
20348
+ all_image_sizes.push([new_height, new_width]);
20349
+ }
20350
+ }
20351
+ const result = {
20352
+ pixel_values: cat(all_pixel_values, 0),
20353
+ pixel_attention_mask: stack(all_pixel_masks, 0),
20354
+ spatial_shapes: new Tensor2("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
20355
+ all_spatial_shapes.length,
20356
+ 2
20357
+ ])
20358
+ };
20359
+ if (return_row_col_info ?? this.return_row_col_info) {
20360
+ result.image_rows = all_rows;
20361
+ result.image_cols = all_cols;
20362
+ result.image_sizes = all_image_sizes;
20363
+ }
20364
+ return result;
20365
+ }
20366
+ };
20367
+
19558
20368
  // src/models/llava_onevision/image_processing_llava_onevision.js
19559
20369
  var LlavaOnevisionImageProcessor = class extends ImageProcessor {
19560
20370
  };
@@ -19777,76 +20587,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
19777
20587
  var PvtImageProcessor = class extends ImageProcessor {
19778
20588
  };
19779
20589
 
19780
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
19781
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
19782
- if (height < factor || width < factor) {
19783
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
19784
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
19785
- throw new Error(
19786
- `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
19787
- );
19788
- }
19789
- let h_bar = Math.round(height / factor) * factor;
19790
- let w_bar = Math.round(width / factor) * factor;
19791
- if (h_bar * w_bar > max_pixels) {
19792
- const beta = Math.sqrt(height * width / max_pixels);
19793
- h_bar = Math.floor(height / beta / factor) * factor;
19794
- w_bar = Math.floor(width / beta / factor) * factor;
19795
- } else if (h_bar * w_bar < min_pixels) {
19796
- const beta = Math.sqrt(min_pixels / (height * width));
19797
- h_bar = Math.ceil(height * beta / factor) * factor;
19798
- w_bar = Math.ceil(width * beta / factor) * factor;
19799
- }
19800
- return [h_bar, w_bar];
19801
- }
19802
- var Qwen2VLImageProcessor = class extends ImageProcessor {
19803
- constructor(config) {
19804
- super(config);
19805
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
19806
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
19807
- this.patch_size = config.patch_size;
19808
- this.merge_size = config.merge_size;
19809
- }
19810
- /** @type {ImageProcessor['get_resize_output_image_size']} */
19811
- get_resize_output_image_size(image, size) {
19812
- const factor = this.patch_size * this.merge_size;
19813
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
19814
- }
19815
- async _call(images, ...args) {
19816
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
19817
- let patches = pixel_values;
19818
- const { temporal_patch_size, merge_size, patch_size } = this.config;
19819
- if (patches.dims[0] === 1) {
19820
- patches = cat(
19821
- Array.from({ length: temporal_patch_size }, () => patches),
19822
- 0
19823
- );
19824
- }
19825
- const grid_t = patches.dims[0] / temporal_patch_size;
19826
- const channel = patches.dims[1];
19827
- const grid_h = Math.floor(patches.dims[2] / patch_size);
19828
- const grid_w = Math.floor(patches.dims[3] / patch_size);
19829
- const flatten_patches = patches.view(
19830
- grid_t,
19831
- temporal_patch_size,
19832
- channel,
19833
- Math.floor(grid_h / merge_size),
19834
- merge_size,
19835
- patch_size,
19836
- Math.floor(grid_w / merge_size),
19837
- merge_size,
19838
- patch_size
19839
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
19840
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
19841
- return {
19842
- pixel_values: flatten_patches,
19843
- image_grid_thw,
19844
- original_sizes,
19845
- reshaped_input_sizes
19846
- };
19847
- }
19848
- };
19849
-
19850
20590
  // src/models/rt_detr/image_processing_rt_detr.js
19851
20591
  var RTDetrImageProcessor = class extends ImageProcessor {
19852
20592
  /** @type {typeof post_process_object_detection} */
@@ -20400,6 +21140,107 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
20400
21140
  }
20401
21141
  };
20402
21142
 
21143
+ // src/models/qwen2_vl/processing_qwen2_vl.js
21144
+ var Qwen2VLProcessor = class extends Processor {
21145
+ static image_processor_class = AutoImageProcessor;
21146
+ static tokenizer_class = AutoTokenizer;
21147
+ static image_token = "<|image_pad|>";
21148
+ /**
21149
+ *
21150
+ * @param {string|string[]} text
21151
+ * @param {RawImage|RawImage[]} images
21152
+ * @param {...any} args
21153
+ * @returns {Promise<any>}
21154
+ */
21155
+ async _call(text, images = null, ...args) {
21156
+ if (!Array.isArray(text)) {
21157
+ text = [text];
21158
+ }
21159
+ let image_inputs, image_grid_thw;
21160
+ if (images) {
21161
+ image_inputs = await this.image_processor(images);
21162
+ image_grid_thw = image_inputs.image_grid_thw;
21163
+ }
21164
+ if (image_grid_thw) {
21165
+ let merge_length = this.image_processor.config.merge_size ** 2;
21166
+ let index = 0;
21167
+ const image_token = (
21168
+ /** @type {typeof Qwen2VLProcessor} */
21169
+ this.constructor.image_token
21170
+ );
21171
+ const image_grid_thw_list = image_grid_thw.tolist();
21172
+ text = text.map((t) => {
21173
+ while (t.includes(image_token)) {
21174
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21175
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21176
+ }
21177
+ return t.replaceAll("<|placeholder|>", image_token);
21178
+ });
21179
+ }
21180
+ const text_inputs = this.tokenizer(text);
21181
+ return {
21182
+ ...text_inputs,
21183
+ ...image_inputs
21184
+ };
21185
+ }
21186
+ };
21187
+
21188
+ // src/models/glm46v/processing_glm46v.js
21189
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
21190
+ static image_token = "<|image|>";
21191
+ };
21192
+
21193
+ // src/models/granite_speech/processing_granite_speech.js
21194
+ var GraniteSpeechProcessor = class extends Processor {
21195
+ static tokenizer_class = AutoTokenizer;
21196
+ static feature_extractor_class = AutoFeatureExtractor;
21197
+ static uses_processor_config = true;
21198
+ /**
21199
+ * Compute the number of audio tokens for a given raw audio length.
21200
+ * @param {number} audioLength Raw audio sample count.
21201
+ * @returns {number} Number of projector output tokens.
21202
+ */
21203
+ _get_num_audio_features(audioLength) {
21204
+ const { hop_length } = this.feature_extractor.config.melspec_kwargs;
21205
+ const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
21206
+ const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
21207
+ const mel_length = Math.floor(audioLength / hop_length) + 1;
21208
+ const encoder_length = Math.floor(mel_length / 2);
21209
+ const nblocks = Math.ceil(encoder_length / projector_window_size);
21210
+ return nblocks * effective_window_size;
21211
+ }
21212
+ /**
21213
+ * @param {string} text The text input to process.
21214
+ * @param {Float32Array} audio The audio input to process.
21215
+ */
21216
+ async _call(text, audio = null, kwargs = {}) {
21217
+ if (Array.isArray(text)) {
21218
+ throw new Error("Batched inputs are not supported yet.");
21219
+ }
21220
+ let audio_inputs = {};
21221
+ if (audio) {
21222
+ const { input_features } = await this.feature_extractor(audio);
21223
+ audio_inputs["input_features"] = input_features;
21224
+ const audio_embed_size = this._get_num_audio_features(audio.length);
21225
+ const mask_data = new Uint8Array(audio_embed_size).fill(1);
21226
+ audio_inputs["input_features_mask"] = new Tensor2("bool", mask_data, [1, audio_embed_size]);
21227
+ const audio_token = this.config.audio_token ?? "<|audio|>";
21228
+ if (!text.includes(audio_token)) {
21229
+ throw new Error(`The input text does not contain the audio token ${audio_token}.`);
21230
+ }
21231
+ text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
21232
+ }
21233
+ const text_inputs = this.tokenizer(text, {
21234
+ add_special_tokens: false,
21235
+ ...kwargs
21236
+ });
21237
+ return {
21238
+ ...text_inputs,
21239
+ ...audio_inputs
21240
+ };
21241
+ }
21242
+ };
21243
+
20403
21244
  // src/models/grounding_dino/processing_grounding_dino.js
20404
21245
  function get_phrases_from_posmap(posmaps, input_ids) {
20405
21246
  const left_idx = 0;
@@ -20676,6 +21517,66 @@ var JinaCLIPProcessor = class extends Processor {
20676
21517
  }
20677
21518
  };
20678
21519
 
21520
+ // src/models/lfm2_vl/processing_lfm2_vl.js
21521
+ var Lfm2VlProcessor = class extends Processor {
21522
+ static tokenizer_class = AutoTokenizer;
21523
+ static image_processor_class = AutoImageProcessor;
21524
+ /**
21525
+ * @param {RawImage|RawImage[]} images
21526
+ * @param {string|string[]|null} [text]
21527
+ * @param {Record<string, any>} [kwargs]
21528
+ */
21529
+ async _call(images, text = null, kwargs = {}) {
21530
+ const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
21531
+ ...kwargs,
21532
+ return_row_col_info: true
21533
+ });
21534
+ if (text) {
21535
+ const image_token = this.config.image_token ?? "<image>";
21536
+ const {
21537
+ tile_size = 512,
21538
+ downsample_factor = 2,
21539
+ encoder_patch_size = 16,
21540
+ use_thumbnail = true
21541
+ } = (
21542
+ /** @type {Record<string, any>} */
21543
+ this.image_processor.config
21544
+ );
21545
+ const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
21546
+ const tokens_per_tile = ds2(tile_size) ** 2;
21547
+ const image_start = this.config.image_start_token ?? "<|image_start|>";
21548
+ const image_end = this.config.image_end_token ?? "<|image_end|>";
21549
+ const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";
21550
+ if (!Array.isArray(text)) text = [text];
21551
+ let image_idx = 0;
21552
+ text = text.map((sample) => {
21553
+ const parts = sample.split(image_token);
21554
+ return parts[0] + parts.slice(1).map((part) => {
21555
+ const idx = image_idx++;
21556
+ const [h, w] = image_sizes[idx];
21557
+ const rows = image_rows[idx], cols = image_cols[idx];
21558
+ const tokens_for_image = ds2(h) * ds2(w);
21559
+ let expanded = image_start;
21560
+ if (rows > 1 || cols > 1) {
21561
+ const tile_str = image_token.repeat(tokens_per_tile);
21562
+ for (let r = 0; r < rows; ++r)
21563
+ for (let c = 0; c < cols; ++c)
21564
+ expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
21565
+ if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
21566
+ } else {
21567
+ expanded += image_token.repeat(tokens_for_image);
21568
+ }
21569
+ return expanded + image_end + part;
21570
+ }).join("");
21571
+ });
21572
+ }
21573
+ return {
21574
+ ...image_inputs,
21575
+ ...text ? this.tokenizer(text, kwargs) : {}
21576
+ };
21577
+ }
21578
+ };
21579
+
20679
21580
  // src/models/llava/processing_llava.js
20680
21581
  var LlavaProcessor = class extends Processor {
20681
21582
  static tokenizer_class = AutoTokenizer;
@@ -21019,47 +21920,6 @@ var PyAnnoteProcessor = class extends Processor {
21019
21920
  }
21020
21921
  };
21021
21922
 
21022
- // src/models/qwen2_vl/processing_qwen2_vl.js
21023
- var Qwen2VLProcessor = class extends Processor {
21024
- static image_processor_class = AutoImageProcessor;
21025
- static tokenizer_class = AutoTokenizer;
21026
- /**
21027
- *
21028
- * @param {string|string[]} text
21029
- * @param {RawImage|RawImage[]} images
21030
- * @param {...any} args
21031
- * @returns {Promise<any>}
21032
- */
21033
- async _call(text, images = null, ...args) {
21034
- if (!Array.isArray(text)) {
21035
- text = [text];
21036
- }
21037
- let image_inputs, image_grid_thw;
21038
- if (images) {
21039
- image_inputs = await this.image_processor(images);
21040
- image_grid_thw = image_inputs.image_grid_thw;
21041
- }
21042
- if (image_grid_thw) {
21043
- let merge_length = this.image_processor.config.merge_size ** 2;
21044
- let index = 0;
21045
- const image_grid_thw_list = image_grid_thw.tolist();
21046
- text = text.map((t) => {
21047
- while (t.includes("<|image_pad|>")) {
21048
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21049
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21050
- }
21051
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
21052
- });
21053
- }
21054
- const text_inputs = this.tokenizer(text);
21055
- return {
21056
- ...text_inputs,
21057
- ...image_inputs
21058
- // TODO: ...videos_inputs,
21059
- };
21060
- }
21061
- };
21062
-
21063
21923
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
21064
21924
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
21065
21925
  };
@@ -21208,6 +22068,94 @@ var VoxtralProcessor = class extends Processor {
21208
22068
  }
21209
22069
  };
21210
22070
 
22071
+ // src/models/voxtral_realtime/processing_voxtral_realtime.js
22072
+ var NUM_LEFT_PAD_TOKENS = 32;
22073
+ var NUM_DELAY_TOKENS = 6;
22074
+ var AUDIO_LENGTH_PER_TOK = 8;
22075
+ var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
22076
+ var STREAMING_PAD_TOKEN_ID = 32;
22077
+ var VoxtralRealtimeProcessor = class extends Processor {
22078
+ static tokenizer_class = AutoTokenizer;
22079
+ static feature_extractor_class = AutoFeatureExtractor;
22080
+ static uses_processor_config = false;
22081
+ /** Number of mel frames in the first audio chunk. */
22082
+ get num_mel_frames_first_audio_chunk() {
22083
+ return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
22084
+ }
22085
+ /** Number of raw audio samples in the first audio chunk. */
22086
+ get num_samples_first_audio_chunk() {
22087
+ const { hop_length, n_fft } = this.feature_extractor.config;
22088
+ return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
22089
+ }
22090
+ /** Number of raw audio samples per subsequent audio chunk. */
22091
+ get num_samples_per_audio_chunk() {
22092
+ const { hop_length, n_fft } = this.feature_extractor.config;
22093
+ return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
22094
+ }
22095
+ /** Number of right-pad tokens for non-streaming mode. */
22096
+ get num_right_pad_tokens() {
22097
+ return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
22098
+ }
22099
+ /** Number of mel frames per text token. */
22100
+ get audio_length_per_tok() {
22101
+ return AUDIO_LENGTH_PER_TOK;
22102
+ }
22103
+ /** Number of raw audio samples per token. */
22104
+ get raw_audio_length_per_tok() {
22105
+ return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
22106
+ }
22107
+ /**
22108
+ * Process audio input for VoxtralRealtime.
22109
+ *
22110
+ * In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
22111
+ * with silence and mel features are extracted with `center=true`.
22112
+ * Returns `{ input_ids, input_features }`.
22113
+ *
22114
+ * In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
22115
+ * processed with `center=false` and only `{ input_features }` is returned.
22116
+ *
22117
+ * In non-streaming mode, the audio is right-padded to ensure the model
22118
+ * transcribes the full audio, then processed with `center=true`.
22119
+ * Returns `{ input_features }`.
22120
+ *
22121
+ * @param {Float32Array|Float64Array} audio The audio waveform.
22122
+ * @param {Object} [options]
22123
+ * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
22124
+ * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
22125
+ * @returns {Promise<Object>}
22126
+ */
22127
+ async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
22128
+ validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
22129
+ if (!is_streaming && !is_first_audio_chunk) {
22130
+ throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
22131
+ }
22132
+ if (is_first_audio_chunk) {
22133
+ if (is_streaming) {
22134
+ const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
22135
+ const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
22136
+ padded_audio.set(audio, num_left_pad_samples);
22137
+ const audio_encoding = await this.feature_extractor(padded_audio, { center: true });
22138
+ const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
22139
+ const num_input_tokens = 1 + num_pad_tokens;
22140
+ const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
22141
+ input_ids_data[0] = 1n;
22142
+ const input_ids = new Tensor2("int64", input_ids_data, [1, num_input_tokens]);
22143
+ return {
22144
+ input_ids,
22145
+ ...audio_encoding
22146
+ };
22147
+ } else {
22148
+ const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
22149
+ const padded_audio = new Float32Array(audio.length + right_pad_samples);
22150
+ padded_audio.set(audio);
22151
+ return await this.feature_extractor(padded_audio, { center: true });
22152
+ }
22153
+ } else {
22154
+ return await this.feature_extractor(audio, { center: false });
22155
+ }
22156
+ }
22157
+ };
22158
+
21211
22159
  // src/models/wav2vec2/processing_wav2vec2.js
21212
22160
  var Wav2Vec2Processor = class extends Processor {
21213
22161
  static tokenizer_class = AutoTokenizer;
@@ -21307,11 +22255,16 @@ function getNormalizedConfig(config) {
21307
22255
  case "florence2":
21308
22256
  case "llava_onevision":
21309
22257
  case "idefics3":
22258
+ case "granite_speech":
21310
22259
  case "ultravox":
21311
22260
  case "voxtral":
22261
+ case "voxtral_realtime":
21312
22262
  case "smolvlm":
21313
22263
  case "gemma3n":
22264
+ case "lfm2_vl":
21314
22265
  case "chatterbox":
22266
+ case "lighton_ocr":
22267
+ case "glm_ocr":
21315
22268
  case "mistral3":
21316
22269
  case "qwen2_5_vl":
21317
22270
  case "qwen3_vl":
@@ -21365,10 +22318,13 @@ function getNormalizedConfig(config) {
21365
22318
  case "cohere":
21366
22319
  case "cohere2":
21367
22320
  case "mistral":
22321
+ case "voxtral_realtime_text":
22322
+ case "voxtral_realtime_encoder":
21368
22323
  case "starcoder2":
21369
22324
  case "qwen2":
21370
22325
  case "qwen2_moe":
21371
22326
  case "qwen2_vl":
22327
+ case "qwen2_vl_text":
21372
22328
  case "qwen2_5_vl_text":
21373
22329
  case "qwen3_moe":
21374
22330
  case "qwen3_vl_text":
@@ -21384,6 +22340,8 @@ function getNormalizedConfig(config) {
21384
22340
  mapping["dim_kv"] = "head_dim";
21385
22341
  break;
21386
22342
  case "qwen3":
22343
+ case "solar_open":
22344
+ case "glm_ocr_text":
21387
22345
  case "gemma":
21388
22346
  case "gemma2":
21389
22347
  case "vaultgemma":
@@ -21394,6 +22352,7 @@ function getNormalizedConfig(config) {
21394
22352
  case "ernie4_5":
21395
22353
  case "hunyuan_v1_dense":
21396
22354
  case "falcon_h1":
22355
+ case "nemotron_h":
21397
22356
  case "ministral":
21398
22357
  case "ministral3":
21399
22358
  mapping["num_heads"] = "num_key_value_heads";
@@ -21428,6 +22387,9 @@ function getNormalizedConfig(config) {
21428
22387
  mapping["num_attention_heads"] = "num_attention_heads";
21429
22388
  break;
21430
22389
  case "youtu":
22390
+ case "deepseek_v3":
22391
+ case "glm_moe_dsa":
22392
+ case "mistral4":
21431
22393
  mapping["num_heads"] = "num_key_value_heads";
21432
22394
  mapping["num_layers"] = "num_hidden_layers";
21433
22395
  mapping["dim_kv"] = "qk_head_dim";
@@ -21513,6 +22475,10 @@ function getNormalizedConfig(config) {
21513
22475
  return normalized_config;
21514
22476
  }
21515
22477
  function getCacheShapes(config, options) {
22478
+ if (!(config instanceof PretrainedConfig)) {
22479
+ config = new PretrainedConfig(config);
22480
+ }
22481
+ const batch_size = options?.batch_size ?? 1;
21516
22482
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
21517
22483
  const pkv_prefix = options?.prefix ?? "past_key_values";
21518
22484
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -21522,7 +22488,6 @@ function getCacheShapes(config, options) {
21522
22488
  config
21523
22489
  );
21524
22490
  const head_dim = hidden_size / num_attention_heads;
21525
- const batch_size = options?.batch_size ?? 1;
21526
22491
  for (let i = 0; i < layer_types.length; ++i) {
21527
22492
  if (layer_types[i] === "full_attention") {
21528
22493
  for (const kv of ["key", "value"]) {
@@ -21535,31 +22500,26 @@ function getCacheShapes(config, options) {
21535
22500
  }
21536
22501
  }
21537
22502
  return cache_values;
21538
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
22503
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
21539
22504
  const pkv_prefix = options?.prefix ?? "past_key_values";
21540
22505
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
21541
- const cache_values = {};
21542
- const {
21543
- layer_types,
21544
- num_hidden_layers,
21545
- num_attention_heads,
21546
- num_key_value_heads,
21547
- hidden_size,
21548
- mamba_d_conv,
21549
- mamba_n_heads,
21550
- mamba_d_head,
21551
- mamba_d_state,
21552
- mamba_n_groups,
21553
- mamba_expand,
21554
- mamba_d_ssm
21555
- } = (
22506
+ const c = (
21556
22507
  /** @type {any} */
21557
22508
  config
21558
22509
  );
21559
- const head_dim = hidden_size / num_attention_heads;
21560
- const batch_size = options?.batch_size ?? 1;
21561
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
21562
- for (let i = 0; i < num_hidden_layers; ++i) {
22510
+ const layer_types = c.layer_types ?? c.layers_block_type;
22511
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
22512
+ const num_key_value_heads = c.num_key_value_heads;
22513
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
22514
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
22515
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
22516
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
22517
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
22518
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
22519
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
22520
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
22521
+ const cache_values = {};
22522
+ for (let i = 0; i < num_layers; ++i) {
21563
22523
  if (!layer_types || layer_types[i] === "mamba") {
21564
22524
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
21565
22525
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -21593,7 +22553,6 @@ function getCacheShapes(config, options) {
21593
22553
  const key_dim = linear_key_head_dim * linear_num_key_heads;
21594
22554
  const value_dim = linear_value_head_dim * linear_num_value_heads;
21595
22555
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
21596
- const batch_size = options?.batch_size ?? 1;
21597
22556
  for (let i = 0; i < layer_types.length; ++i) {
21598
22557
  if (layer_types[i] === "full_attention") {
21599
22558
  for (const kv of ["key", "value"]) {
@@ -21619,12 +22578,16 @@ function getCacheShapes(config, options) {
21619
22578
  }
21620
22579
  }
21621
22580
  return cache_values;
21622
- } else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
21623
- return getCacheShapes(
21624
- /**@type {any} */
21625
- config.text_config,
21626
- options
21627
- );
22581
+ } else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
22582
+ let subConfig;
22583
+ if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
22584
+ subConfig = /** @type {any} */
22585
+ config.audio_config;
22586
+ } else {
22587
+ subConfig = /** @type {any} */
22588
+ config.text_config;
22589
+ }
22590
+ return getCacheShapes(subConfig, options);
21628
22591
  }
21629
22592
  return getKeyValueShapes(config, options);
21630
22593
  }
@@ -21790,7 +22753,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
21790
22753
  }
21791
22754
 
21792
22755
  // src/models/session.js
21793
- async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) {
22756
+ async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
21794
22757
  let custom_config = options.config?.["transformers.js_config"] ?? {};
21795
22758
  const selectedDevice = (
21796
22759
  /** @type {import("../utils/devices.js").DeviceType} */
@@ -21848,9 +22811,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
21848
22811
  if (externalData.length > 0 && !apis.IS_NODE_ENV) {
21849
22812
  session_options.externalData = externalData;
21850
22813
  }
21851
- if (is_decoder && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
22814
+ if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
21852
22815
  const shapes = getCacheShapes(options.config, {
21853
- prefix: "present"
22816
+ prefix: "present",
22817
+ session_name
21854
22818
  });
21855
22819
  if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
21856
22820
  const preferredOutputLocation = {};
@@ -21868,15 +22832,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
21868
22832
  };
21869
22833
  return { buffer_or_path, session_options, session_config };
21870
22834
  }
21871
- async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = void 0) {
22835
+ async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
21872
22836
  return Object.fromEntries(
21873
22837
  await Promise.all(
21874
22838
  Object.keys(names).map(async (name) => {
22839
+ const cache_config = cache_sessions?.[name] ?? false;
21875
22840
  const { buffer_or_path, session_options, session_config } = await getSession(
21876
22841
  pretrained_model_name_or_path,
21877
22842
  names[name],
21878
22843
  options,
21879
- name === decoder_name
22844
+ cache_config,
22845
+ name
21880
22846
  );
21881
22847
  const session = await createInferenceSession(buffer_or_path, session_options, session_config);
21882
22848
  return [name, session];
@@ -23176,19 +24142,71 @@ var BeamSearchSampler = class extends LogitsSampler {
23176
24142
  }
23177
24143
  };
23178
24144
 
24145
+ // src/cache_utils.js
24146
var _DynamicCache = class {
  /**
   * Create a DynamicCache, optionally pre-populated with entries.
   *
   * Only the own enumerable properties of `entries` are copied, so enumerable
   * properties inherited through its prototype chain are ignored. Objects
   * created with `Object.create(null)` are supported.
   *
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key collides with an existing property or method of the cache,
   * or if a value is not a Tensor.
   */
  constructor(entries) {
    if (!entries) return;
    for (const [key, value] of Object.entries(entries)) {
      // `in` also matches prototype members, so an entry can never shadow a method such as `dispose`.
      if (key in this) {
        throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
      }
      if (!(value instanceof Tensor2)) {
        // `typeof null` is "object", which would make the message misleading.
        const received = value === null ? "null" : typeof value;
        throw new TypeError(`Expected a Tensor for key "${key}", got ${received}`);
      }
      this[key] = value;
    }
  }

  /**
   * Get the cached sequence length. This requires at least one attention cache entry to be present.
   *
   * The length is read from the second-to-last dimension of the first own entry
   * whose name starts with `past_key_values.`.
   *
   * @returns {number} The past sequence length.
   * @throws {Error} If the cache holds no `past_key_values.*` entry.
   */
  get_seq_length() {
    const self2 = (
      /** @type {any} */
      this
    );
    for (const name of Object.keys(self2)) {
      if (name.startsWith("past_key_values.")) {
        return self2[name].dims.at(-2);
      }
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose all contained tensors whose data resides on the GPU.
   * Returns a promise that resolves when all disposals are complete.
   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
   */
  async dispose() {
    const promises = [];
    for (
      const t of
      /** @type {Tensor[]} */
      Object.values(this)
    ) {
      // Entries whose `location` is anything other than "gpu-buffer" are left untouched.
      if (t.location === "gpu-buffer") {
        promises.push(t.dispose());
      }
    }
    await Promise.all(promises);
  }
};

// Typed alias of the class above; `var` is kept as emitted by the bundler.
var DynamicCache = (
  /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
  /** @type {unknown} */
  _DynamicCache
);
24204
+
23179
24205
  // src/models/modeling_utils.js
23180
24206
  var MODEL_MAPPING_NAMES = null;
23181
24207
  function registerTaskMappings(mappings) {
23182
24208
  MODEL_MAPPING_NAMES = mappings;
23183
24209
  }
23184
- function getPastLength(past_key_values) {
23185
- for (const name in past_key_values) {
23186
- if (name.startsWith("past_key_values.")) {
23187
- return past_key_values[name].dims.at(-2);
23188
- }
23189
- }
23190
- return Object.values(past_key_values)[0].dims.at(-2);
23191
- }
23192
24210
  function toI64Tensor(items) {
23193
24211
  if (items instanceof Tensor2) {
23194
24212
  return items;
@@ -23229,71 +24247,181 @@ var MODEL_TYPES = {
23229
24247
  AutoEncoder: 12,
23230
24248
  ImageAudioTextToText: 13,
23231
24249
  Supertonic: 14,
23232
- Chatterbox: 15
24250
+ Chatterbox: 15,
24251
+ MultimodalLanguageModelOnly: 16,
24252
+ VoxtralRealtime: 17
23233
24253
  };
23234
24254
  var MODEL_TYPE_CONFIG = {
23235
24255
  [MODEL_TYPES.DecoderOnly]: {
23236
24256
  can_generate: true,
23237
24257
  forward: decoder_forward,
23238
- prepare_inputs: decoder_prepare_inputs_for_generation
24258
+ prepare_inputs: decoder_prepare_inputs_for_generation,
24259
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
24260
+ cache_sessions: { model: true },
24261
+ optional_configs: { generation_config: "generation_config.json" }
23239
24262
  },
23240
24263
  [MODEL_TYPES.DecoderOnlyWithoutHead]: {
23241
24264
  can_generate: false,
23242
24265
  forward: decoder_forward,
23243
- prepare_inputs: decoder_prepare_inputs_for_generation
24266
+ prepare_inputs: decoder_prepare_inputs_for_generation,
24267
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
23244
24268
  },
23245
24269
  [MODEL_TYPES.Seq2Seq]: {
23246
24270
  can_generate: true,
23247
24271
  forward: seq2seq_forward,
23248
- prepare_inputs: encoder_decoder_prepare_inputs_for_generation
24272
+ prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
24273
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
24274
+ cache_sessions: { decoder_model_merged: true },
24275
+ optional_configs: { generation_config: "generation_config.json" }
23249
24276
  },
23250
24277
  [MODEL_TYPES.Vision2Seq]: {
23251
24278
  can_generate: true,
23252
24279
  forward: seq2seq_forward,
23253
- prepare_inputs: encoder_decoder_prepare_inputs_for_generation
24280
+ prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
24281
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
24282
+ cache_sessions: { decoder_model_merged: true },
24283
+ optional_configs: { generation_config: "generation_config.json" }
23254
24284
  },
23255
24285
  [MODEL_TYPES.Musicgen]: {
23256
24286
  can_generate: true,
23257
- forward: seq2seq_forward
24287
+ forward: seq2seq_forward,
24288
+ sessions: () => ({
24289
+ model: "text_encoder",
24290
+ decoder_model_merged: "decoder_model_merged",
24291
+ encodec_decode: "encodec_decode"
24292
+ }),
24293
+ cache_sessions: { decoder_model_merged: true },
24294
+ optional_configs: { generation_config: "generation_config.json" }
23258
24295
  },
23259
24296
  [MODEL_TYPES.EncoderDecoder]: {
23260
24297
  can_generate: false,
23261
- forward: seq2seq_forward
24298
+ forward: seq2seq_forward,
24299
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
24300
+ cache_sessions: { decoder_model_merged: true }
24301
+ },
24302
+ [MODEL_TYPES.MaskGeneration]: {
24303
+ sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
23262
24304
  },
23263
24305
  [MODEL_TYPES.ImageTextToText]: {
23264
24306
  can_generate: true,
23265
24307
  forward: image_text_to_text_forward,
23266
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
24308
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24309
+ sessions: (config) => {
24310
+ const s = {
24311
+ embed_tokens: "embed_tokens",
24312
+ vision_encoder: "vision_encoder",
24313
+ decoder_model_merged: "decoder_model_merged"
24314
+ };
24315
+ if (config.is_encoder_decoder) s["model"] = "encoder_model";
24316
+ return s;
24317
+ },
24318
+ cache_sessions: { decoder_model_merged: true },
24319
+ optional_configs: { generation_config: "generation_config.json" }
23267
24320
  },
23268
24321
  [MODEL_TYPES.AudioTextToText]: {
23269
24322
  can_generate: true,
23270
24323
  forward: audio_text_to_text_forward,
23271
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
24324
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24325
+ sessions: () => ({
24326
+ embed_tokens: "embed_tokens",
24327
+ audio_encoder: "audio_encoder",
24328
+ decoder_model_merged: "decoder_model_merged"
24329
+ }),
24330
+ cache_sessions: { decoder_model_merged: true },
24331
+ optional_configs: { generation_config: "generation_config.json" }
23272
24332
  },
23273
- [MODEL_TYPES.Phi3V]: {
24333
+ [MODEL_TYPES.ImageAudioTextToText]: {
23274
24334
  can_generate: true,
23275
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
24335
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24336
+ sessions: () => ({
24337
+ embed_tokens: "embed_tokens",
24338
+ audio_encoder: "audio_encoder",
24339
+ vision_encoder: "vision_encoder",
24340
+ decoder_model_merged: "decoder_model_merged"
24341
+ }),
24342
+ optional_configs: { generation_config: "generation_config.json" }
23276
24343
  },
23277
- [MODEL_TYPES.ImageAudioTextToText]: {
24344
+ [MODEL_TYPES.Phi3V]: {
23278
24345
  can_generate: true,
23279
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
24346
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24347
+ sessions: () => ({
24348
+ prepare_inputs_embeds: "prepare_inputs_embeds",
24349
+ model: "model",
24350
+ vision_encoder: "vision_encoder"
24351
+ }),
24352
+ cache_sessions: { model: true },
24353
+ optional_configs: { generation_config: "generation_config.json" }
23280
24354
  },
23281
24355
  [MODEL_TYPES.MultiModality]: {
23282
- can_generate: true
24356
+ can_generate: true,
24357
+ sessions: () => ({
24358
+ prepare_inputs_embeds: "prepare_inputs_embeds",
24359
+ model: "language_model",
24360
+ lm_head: "lm_head",
24361
+ gen_head: "gen_head",
24362
+ gen_img_embeds: "gen_img_embeds",
24363
+ image_decode: "image_decode"
24364
+ }),
24365
+ cache_sessions: { model: true },
24366
+ optional_configs: { generation_config: "generation_config.json" }
23283
24367
  },
23284
24368
  [MODEL_TYPES.AutoEncoder]: {
23285
24369
  can_generate: false,
23286
- forward: auto_encoder_forward
24370
+ forward: auto_encoder_forward,
24371
+ sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
24372
+ },
24373
+ [MODEL_TYPES.Supertonic]: {
24374
+ sessions: () => ({
24375
+ text_encoder: "text_encoder",
24376
+ latent_denoiser: "latent_denoiser",
24377
+ voice_decoder: "voice_decoder"
24378
+ })
23287
24379
  },
23288
24380
  [MODEL_TYPES.Chatterbox]: {
23289
24381
  can_generate: true,
23290
- forward: encoder_forward
24382
+ forward: encoder_forward,
24383
+ sessions: () => ({
24384
+ embed_tokens: "embed_tokens",
24385
+ speech_encoder: "speech_encoder",
24386
+ model: "language_model",
24387
+ conditional_decoder: "conditional_decoder"
24388
+ }),
24389
+ cache_sessions: { model: true },
24390
+ optional_configs: { generation_config: "generation_config.json" }
24391
+ },
24392
+ [MODEL_TYPES.MultimodalLanguageModelOnly]: {
24393
+ can_generate: true,
24394
+ forward: image_text_to_text_forward,
24395
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24396
+ sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
24397
+ cache_sessions: { decoder_model_merged: true },
24398
+ optional_configs: { generation_config: "generation_config.json" }
24399
+ },
24400
+ [MODEL_TYPES.VoxtralRealtime]: {
24401
+ can_generate: true,
24402
+ prepare_inputs: decoder_prepare_inputs_for_generation,
24403
+ sessions: () => ({
24404
+ embed_tokens: "embed_tokens",
24405
+ audio_encoder: "audio_encoder",
24406
+ decoder_model_merged: "decoder_model_merged"
24407
+ }),
24408
+ cache_sessions: { decoder_model_merged: true, audio_encoder: true },
24409
+ optional_configs: { generation_config: "generation_config.json" }
23291
24410
  },
23292
24411
  default: {
23293
24412
  can_generate: false,
23294
- forward: encoder_forward
24413
+ forward: encoder_forward,
24414
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
23295
24415
  }
23296
24416
  };
24417
/**
 * Resolve the session layout registered for a model type.
 *
 * Falls back to the `default` entry of `MODEL_TYPE_CONFIG` when `modelType`
 * has no entry of its own.
 *
 * @param {number|string} modelType Key into `MODEL_TYPE_CONFIG`.
 * @param {Object} config Model configuration, forwarded to the entry's `sessions` factory.
 * @param {Object} [options={}] Loading options, forwarded to the entry's `sessions` factory.
 * @returns {{sessions: Object, cache_sessions: (Object|undefined), optional_configs: (Object|undefined)}}
 * The sessions returned by the factory, plus the entry's `cache_sessions` and
 * `optional_configs` (either may be `undefined`).
 */
function getSessionsConfig(modelType, config, options = {}) {
  const entry = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
  const sessions = entry.sessions(config, options);
  return {
    sessions,
    cache_sessions: entry.cache_sessions,
    optional_configs: entry.optional_configs
  };
}
23297
24425
  var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
23298
24426
  var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
23299
24427
  var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
@@ -23379,245 +24507,23 @@ var PreTrainedModel = class extends Callable2 {
23379
24507
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
23380
24508
  const modelType = MODEL_TYPE_MAPPING.get(modelName);
23381
24509
  config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
23382
- let info;
23383
- if (modelType === MODEL_TYPES.DecoderOnly) {
23384
- info = await Promise.all([
23385
- constructSessions(
23386
- pretrained_model_name_or_path,
23387
- {
23388
- model: options.model_file_name ?? "model"
23389
- },
23390
- options,
23391
- "model"
23392
- ),
23393
- get_optional_configs(
23394
- pretrained_model_name_or_path,
23395
- {
23396
- generation_config: "generation_config.json"
23397
- },
23398
- options
23399
- )
23400
- ]);
23401
- } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
23402
- info = await Promise.all([
23403
- constructSessions(
23404
- pretrained_model_name_or_path,
23405
- {
23406
- model: "encoder_model",
23407
- decoder_model_merged: "decoder_model_merged"
23408
- },
23409
- options,
23410
- "decoder_model_merged"
23411
- ),
23412
- get_optional_configs(
23413
- pretrained_model_name_or_path,
23414
- {
23415
- generation_config: "generation_config.json"
23416
- },
23417
- options
23418
- )
23419
- ]);
23420
- } else if (modelType === MODEL_TYPES.MaskGeneration) {
23421
- info = await Promise.all([
23422
- constructSessions(
23423
- pretrained_model_name_or_path,
23424
- {
23425
- model: "vision_encoder",
23426
- prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
23427
- },
23428
- options
23429
- )
23430
- ]);
23431
- } else if (modelType === MODEL_TYPES.EncoderDecoder) {
23432
- info = await Promise.all([
23433
- constructSessions(
23434
- pretrained_model_name_or_path,
23435
- {
23436
- model: "encoder_model",
23437
- decoder_model_merged: "decoder_model_merged"
23438
- },
23439
- options,
23440
- "decoder_model_merged"
23441
- )
23442
- ]);
23443
- } else if (modelType === MODEL_TYPES.ImageTextToText) {
23444
- const sessions = {
23445
- embed_tokens: "embed_tokens",
23446
- vision_encoder: "vision_encoder",
23447
- decoder_model_merged: "decoder_model_merged"
23448
- };
23449
- if (config.is_encoder_decoder) {
23450
- sessions["model"] = "encoder_model";
23451
- }
23452
- info = await Promise.all([
23453
- constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
23454
- get_optional_configs(
23455
- pretrained_model_name_or_path,
23456
- {
23457
- generation_config: "generation_config.json"
23458
- },
23459
- options
23460
- )
23461
- ]);
23462
- } else if (modelType === MODEL_TYPES.AudioTextToText) {
23463
- const sessions = {
23464
- embed_tokens: "embed_tokens",
23465
- audio_encoder: "audio_encoder",
23466
- decoder_model_merged: "decoder_model_merged"
23467
- };
23468
- info = await Promise.all([
23469
- constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
23470
- get_optional_configs(
23471
- pretrained_model_name_or_path,
23472
- {
23473
- generation_config: "generation_config.json"
23474
- },
23475
- options
23476
- )
23477
- ]);
23478
- } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
23479
- const sessions = {
23480
- embed_tokens: "embed_tokens",
23481
- audio_encoder: "audio_encoder",
23482
- vision_encoder: "vision_encoder",
23483
- decoder_model_merged: "decoder_model_merged"
23484
- };
23485
- info = await Promise.all([
23486
- constructSessions(pretrained_model_name_or_path, sessions, options),
23487
- get_optional_configs(
23488
- pretrained_model_name_or_path,
23489
- {
23490
- generation_config: "generation_config.json"
23491
- },
23492
- options
23493
- )
23494
- ]);
23495
- } else if (modelType === MODEL_TYPES.Musicgen) {
23496
- info = await Promise.all([
23497
- constructSessions(
23498
- pretrained_model_name_or_path,
23499
- {
23500
- model: "text_encoder",
23501
- decoder_model_merged: "decoder_model_merged",
23502
- encodec_decode: "encodec_decode"
23503
- },
23504
- options,
23505
- "decoder_model_merged"
23506
- ),
23507
- get_optional_configs(
23508
- pretrained_model_name_or_path,
23509
- {
23510
- generation_config: "generation_config.json"
23511
- },
23512
- options
23513
- )
23514
- ]);
23515
- } else if (modelType === MODEL_TYPES.MultiModality) {
23516
- info = await Promise.all([
23517
- constructSessions(
23518
- pretrained_model_name_or_path,
23519
- {
23520
- prepare_inputs_embeds: "prepare_inputs_embeds",
23521
- model: "language_model",
23522
- lm_head: "lm_head",
23523
- gen_head: "gen_head",
23524
- gen_img_embeds: "gen_img_embeds",
23525
- image_decode: "image_decode"
23526
- },
23527
- options,
23528
- "model"
23529
- ),
23530
- get_optional_configs(
23531
- pretrained_model_name_or_path,
23532
- {
23533
- generation_config: "generation_config.json"
23534
- },
23535
- options
23536
- )
23537
- ]);
23538
- } else if (modelType === MODEL_TYPES.Phi3V) {
23539
- info = await Promise.all([
23540
- constructSessions(
23541
- pretrained_model_name_or_path,
23542
- {
23543
- prepare_inputs_embeds: "prepare_inputs_embeds",
23544
- model: "model",
23545
- vision_encoder: "vision_encoder"
23546
- },
23547
- options,
23548
- "model"
23549
- ),
23550
- get_optional_configs(
23551
- pretrained_model_name_or_path,
23552
- {
23553
- generation_config: "generation_config.json"
23554
- },
23555
- options
23556
- )
23557
- ]);
23558
- } else if (modelType === MODEL_TYPES.Chatterbox) {
23559
- info = await Promise.all([
23560
- constructSessions(
23561
- pretrained_model_name_or_path,
23562
- {
23563
- embed_tokens: "embed_tokens",
23564
- speech_encoder: "speech_encoder",
23565
- model: "language_model",
23566
- conditional_decoder: "conditional_decoder"
23567
- },
23568
- options,
23569
- "model"
23570
- ),
23571
- get_optional_configs(
23572
- pretrained_model_name_or_path,
23573
- {
23574
- generation_config: "generation_config.json"
23575
- },
23576
- options
23577
- )
23578
- ]);
23579
- } else if (modelType === MODEL_TYPES.AutoEncoder) {
23580
- info = await Promise.all([
23581
- constructSessions(
23582
- pretrained_model_name_or_path,
23583
- {
23584
- encoder_model: "encoder_model",
23585
- decoder_model: "decoder_model"
23586
- },
23587
- options
23588
- )
23589
- ]);
23590
- } else if (modelType === MODEL_TYPES.Supertonic) {
23591
- info = await Promise.all([
23592
- constructSessions(
23593
- pretrained_model_name_or_path,
23594
- {
23595
- text_encoder: "text_encoder",
23596
- latent_denoiser: "latent_denoiser",
23597
- voice_decoder: "voice_decoder"
23598
- },
23599
- options
23600
- )
23601
- ]);
23602
- } else {
23603
- if (modelType === void 0) {
23604
- const type = modelName ?? config?.model_type;
23605
- if (type !== "custom") {
23606
- logger.warn(
23607
- `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
23608
- );
23609
- }
24510
+ const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
24511
+ if (modelType === void 0) {
24512
+ const type = modelName ?? config?.model_type;
24513
+ if (type !== "custom") {
24514
+ logger.warn(
24515
+ `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
24516
+ );
23610
24517
  }
23611
- info = await Promise.all([
23612
- constructSessions(
23613
- pretrained_model_name_or_path,
23614
- {
23615
- model: options.model_file_name ?? "model"
23616
- },
23617
- options
23618
- )
23619
- ]);
23620
24518
  }
24519
+ const sessions = typeConfig.sessions(config, options);
24520
+ const promises = [
24521
+ constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
24522
+ ];
24523
+ if (typeConfig.optional_configs) {
24524
+ promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
24525
+ }
24526
+ const info = await Promise.all(promises);
23621
24527
  return new this(config, ...info);
23622
24528
  }
23623
24529
  /**
@@ -23816,7 +24722,7 @@ var PreTrainedModel = class extends Callable2 {
23816
24722
  * @param {Tensor} [params.inputs=null]
23817
24723
  * @param {number} [params.bos_token_id=null]
23818
24724
  * @param {Record<string, Tensor|number[]>} [params.model_kwargs]
23819
- * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor>, model_input_name: string}} The model-specific inputs for generation.
24725
+ * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
23820
24726
  */
23821
24727
  _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
23822
24728
  const model_inputs = pick(model_kwargs, this.forward_params);
@@ -24057,11 +24963,12 @@ var PreTrainedModel = class extends Callable2 {
24057
24963
  }
24058
24964
  }
24059
24965
  /**
24060
- * Returns an object containing past key values from the given decoder results object.
24966
+ * Returns a DynamicCache containing past key values from the given decoder results object.
24061
24967
  *
24062
24968
  * @param {Object} decoderResults The decoder results object.
24063
- * @param {Object} pastKeyValues The previous past key values.
24064
- * @returns {Object} An object containing past key values.
24969
+ * @param {DynamicCache} pastKeyValues The previous past key values.
24970
+ * @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
24971
+ * @returns {DynamicCache} A new DynamicCache containing the updated past key values.
24065
24972
  */
24066
24973
  getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
24067
24974
  const pkvs = /* @__PURE__ */ Object.create(null);
@@ -24082,7 +24989,7 @@ var PreTrainedModel = class extends Callable2 {
24082
24989
  }
24083
24990
  }
24084
24991
  }
24085
- return pkvs;
24992
+ return new DynamicCache(pkvs);
24086
24993
  }
24087
24994
  /**
24088
24995
  * Returns an object containing attentions from the given model output object.
@@ -24107,8 +25014,8 @@ var PreTrainedModel = class extends Callable2 {
24107
25014
  /**
24108
25015
  * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
24109
25016
  *
24110
- * @param {Object} decoderFeeds The decoder feeds object to add past key values to.
24111
- * @param {Object} pastKeyValues An object containing past key values.
25017
+ * @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
25018
+ * @param {DynamicCache|null} pastKeyValues The cache containing past key values.
24112
25019
  */
24113
25020
  addPastKeyValues(decoderFeeds, pastKeyValues) {
24114
25021
  if (pastKeyValues) {
@@ -24125,14 +25032,29 @@ var PreTrainedModel = class extends Callable2 {
24125
25032
  }
24126
25033
  }
24127
25034
  }
24128
- async encode_image({ pixel_values }) {
24129
- return (await sessionRun(this.sessions["vision_encoder"], { pixel_values })).image_features;
25035
+ /**
25036
+ * Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
25037
+ * @param {string} sessionName
25038
+ * @param {Record<string, Tensor>} inputs
25039
+ * @param {string} outputName
25040
+ * @private
25041
+ */
25042
+ async _encode_input(sessionName, inputs, outputName) {
25043
+ if (!Object.hasOwn(this.sessions, sessionName)) {
25044
+ throw new Error(`Model does not have a ${sessionName} session.`);
25045
+ }
25046
+ const session = this.sessions[sessionName];
25047
+ const output = await sessionRun(session, pick(inputs, session.inputNames));
25048
+ return output[outputName];
24130
25049
  }
24131
- async encode_text({ input_ids }) {
24132
- return (await sessionRun(this.sessions["embed_tokens"], { input_ids })).inputs_embeds;
25050
+ async encode_image(inputs) {
25051
+ return this._encode_input("vision_encoder", inputs, "image_features");
24133
25052
  }
24134
- async encode_audio({ audio_values }) {
24135
- return (await sessionRun(this.sessions["audio_encoder"], { audio_values })).audio_features;
25053
+ async encode_text(inputs) {
25054
+ return this._encode_input("embed_tokens", inputs, "inputs_embeds");
25055
+ }
25056
+ async encode_audio(inputs) {
25057
+ return this._encode_input("audio_encoder", inputs, "audio_features");
24136
25058
  }
24137
25059
  };
24138
25060
  async function seq2seq_forward(self2, model_inputs) {
@@ -24187,6 +25109,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
24187
25109
  const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
24188
25110
  new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
24189
25111
  }
25112
+ if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
25113
+ new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
25114
+ }
24190
25115
  self2.addPastKeyValues(new_model_inputs, past_key_values);
24191
25116
  const fixed = pick(new_model_inputs, session.inputNames);
24192
25117
  return await sessionRun(session, fixed);
@@ -24195,7 +25120,7 @@ async function generic_text_to_text_forward(self2, {
24195
25120
  // Generic parameters:
24196
25121
  encode_function,
24197
25122
  merge_function,
24198
- modality_input_name,
25123
+ modality_input_names,
24199
25124
  modality_output_name,
24200
25125
  // Produced by the tokenizer/processor:
24201
25126
  input_ids = null,
@@ -24210,32 +25135,34 @@ async function generic_text_to_text_forward(self2, {
24210
25135
  // Additional parameters
24211
25136
  ...kwargs
24212
25137
  }) {
24213
- const modality_values = kwargs[modality_input_name];
24214
25138
  if (!inputs_embeds) {
24215
25139
  inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
24216
- if (modality_values && input_ids.dims[1] !== 1) {
24217
- const modality_features = await encode_function({
24218
- // Pass the modality values under its expected key.
24219
- // The caller knows whether this is audio or image.
24220
- [modality_input_name]: modality_values,
24221
- ...kwargs
24222
- });
24223
- ({ inputs_embeds, attention_mask } = merge_function({
24224
- [modality_output_name]: modality_features,
24225
- inputs_embeds,
24226
- input_ids,
24227
- attention_mask
24228
- }));
24229
- } else if (past_key_values && modality_values && input_ids.dims[1] === 1) {
24230
- const target_length = input_ids.dims[1];
24231
- const past_length = getPastLength(past_key_values);
24232
- attention_mask = cat(
24233
- [
24234
- ones([input_ids.dims[0], past_length]),
24235
- attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
24236
- ],
24237
- 1
24238
- );
25140
+ const modality_values = pick(kwargs, modality_input_names);
25141
+ if (Object.keys(modality_values).length > 0) {
25142
+ if (input_ids.dims[1] !== 1) {
25143
+ const modality_features = await encode_function({
25144
+ // Pass the modality values under its expected key.
25145
+ // The caller knows whether this is audio or image.
25146
+ ...modality_values,
25147
+ ...kwargs
25148
+ });
25149
+ ({ inputs_embeds, attention_mask } = merge_function({
25150
+ [modality_output_name]: modality_features,
25151
+ inputs_embeds,
25152
+ input_ids,
25153
+ attention_mask
25154
+ }));
25155
+ } else if (past_key_values && input_ids.dims[1] === 1) {
25156
+ const target_length = input_ids.dims[1];
25157
+ const past_length = past_key_values.get_seq_length();
25158
+ attention_mask = cat(
25159
+ [
25160
+ ones([input_ids.dims[0], past_length]),
25161
+ attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
25162
+ ],
25163
+ 1
25164
+ );
25165
+ }
24239
25166
  }
24240
25167
  }
24241
25168
  if (!position_ids) {
@@ -24243,14 +25170,19 @@ async function generic_text_to_text_forward(self2, {
24243
25170
  // Handle special case for qwen vl models
24244
25171
  [
24245
25172
  "qwen2_vl",
25173
+ "qwen2_vl_text",
24246
25174
  "qwen2_5_vl",
24247
25175
  "qwen2_5_vl_text",
24248
25176
  "qwen3_vl",
24249
25177
  "qwen3_vl_text",
25178
+ "qwen3_vl_moe",
25179
+ "qwen3_vl_moe_text",
24250
25180
  "qwen3_5",
24251
25181
  "qwen3_5_text",
24252
25182
  "qwen3_5_moe",
24253
- "qwen3_5_moe_text"
25183
+ "qwen3_5_moe_text",
25184
+ "glm_ocr",
25185
+ "glm_ocr_text"
24254
25186
  ].includes(self2.config.model_type)
24255
25187
  ) {
24256
25188
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -24274,7 +25206,7 @@ async function generic_text_to_text_forward(self2, {
24274
25206
  async function audio_text_to_text_forward(self2, params) {
24275
25207
  return await generic_text_to_text_forward(self2, {
24276
25208
  ...params,
24277
- modality_input_name: "audio_values",
25209
+ modality_input_names: ["audio_values", "input_features"],
24278
25210
  modality_output_name: "audio_features",
24279
25211
  encode_function: self2.encode_audio.bind(self2),
24280
25212
  merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
@@ -24283,7 +25215,7 @@ async function audio_text_to_text_forward(self2, params) {
24283
25215
  async function image_text_to_text_forward(self2, params) {
24284
25216
  return await generic_text_to_text_forward(self2, {
24285
25217
  ...params,
24286
- modality_input_name: "pixel_values",
25218
+ modality_input_names: ["pixel_values"],
24287
25219
  modality_output_name: "image_features",
24288
25220
  encode_function: self2.encode_image.bind(self2),
24289
25221
  merge_function: self2._merge_input_ids_with_image_features.bind(self2)
@@ -24319,7 +25251,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
24319
25251
  return position_ids;
24320
25252
  }
24321
25253
  function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
24322
- const past_length = model_inputs.past_key_values ? getPastLength(model_inputs.past_key_values) : 0;
25254
+ const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
25255
+ const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
25256
+ if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
25257
+ model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
25258
+ }
24323
25259
  if (!model_inputs.attention_mask) {
24324
25260
  let dims;
24325
25261
  for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
@@ -24470,6 +25406,8 @@ __export(models_exports, {
24470
25406
  BloomForCausalLM: () => BloomForCausalLM,
24471
25407
  BloomModel: () => BloomModel,
24472
25408
  BloomPreTrainedModel: () => BloomPreTrainedModel,
25409
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
25410
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
24473
25411
  CLIPModel: () => CLIPModel,
24474
25412
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
24475
25413
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -24544,6 +25482,9 @@ __export(models_exports, {
24544
25482
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
24545
25483
  DecisionTransformerModel: () => DecisionTransformerModel,
24546
25484
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
25485
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
25486
+ DeepseekV3Model: () => DeepseekV3Model,
25487
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
24547
25488
  DeiTForImageClassification: () => DeiTForImageClassification,
24548
25489
  DeiTModel: () => DeiTModel,
24549
25490
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -24589,6 +25530,11 @@ __export(models_exports, {
24589
25530
  EsmForTokenClassification: () => EsmForTokenClassification,
24590
25531
  EsmModel: () => EsmModel,
24591
25532
  EsmPreTrainedModel: () => EsmPreTrainedModel,
25533
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
25534
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
25535
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
25536
+ EuroBertModel: () => EuroBertModel,
25537
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
24592
25538
  ExaoneForCausalLM: () => ExaoneForCausalLM,
24593
25539
  ExaoneModel: () => ExaoneModel,
24594
25540
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -24627,6 +25573,7 @@ __export(models_exports, {
24627
25573
  Gemma3ForCausalLM: () => Gemma3ForCausalLM,
24628
25574
  Gemma3Model: () => Gemma3Model,
24629
25575
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
25576
+ Gemma3nForCausalLM: () => Gemma3nForCausalLM,
24630
25577
  Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
24631
25578
  Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
24632
25579
  GemmaForCausalLM: () => GemmaForCausalLM,
@@ -24634,6 +25581,10 @@ __export(models_exports, {
24634
25581
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
24635
25582
  GlmForCausalLM: () => GlmForCausalLM,
24636
25583
  GlmModel: () => GlmModel,
25584
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
25585
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
25586
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
25587
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
24637
25588
  GlmPreTrainedModel: () => GlmPreTrainedModel,
24638
25589
  GptOssForCausalLM: () => GptOssForCausalLM,
24639
25590
  GptOssModel: () => GptOssModel,
@@ -24644,6 +25595,7 @@ __export(models_exports, {
24644
25595
  GraniteMoeHybridModel: () => GraniteMoeHybridModel,
24645
25596
  GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
24646
25597
  GranitePreTrainedModel: () => GranitePreTrainedModel,
25598
+ GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
24647
25599
  GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
24648
25600
  GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
24649
25601
  GroupViTModel: () => GroupViTModel,
@@ -24665,7 +25617,6 @@ __export(models_exports, {
24665
25617
  IJepaModel: () => IJepaModel,
24666
25618
  IJepaPreTrainedModel: () => IJepaPreTrainedModel,
24667
25619
  Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
24668
- Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
24669
25620
  JAISLMHeadModel: () => JAISLMHeadModel,
24670
25621
  JAISModel: () => JAISModel,
24671
25622
  JAISPreTrainedModel: () => JAISPreTrainedModel,
@@ -24679,6 +25630,8 @@ __export(models_exports, {
24679
25630
  Lfm2MoeModel: () => Lfm2MoeModel,
24680
25631
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
24681
25632
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
25633
+ Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
25634
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
24682
25635
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
24683
25636
  Llama4ForCausalLM: () => Llama4ForCausalLM,
24684
25637
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -24728,6 +25681,9 @@ __export(models_exports, {
24728
25681
  MimiEncoderOutput: () => MimiEncoderOutput,
24729
25682
  MimiModel: () => MimiModel,
24730
25683
  MimiPreTrainedModel: () => MimiPreTrainedModel,
25684
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
25685
+ Mistral4Model: () => Mistral4Model,
25686
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
24731
25687
  MistralForCausalLM: () => MistralForCausalLM,
24732
25688
  MistralModel: () => MistralModel,
24733
25689
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -24785,6 +25741,9 @@ __export(models_exports, {
24785
25741
  NanoChatForCausalLM: () => NanoChatForCausalLM,
24786
25742
  NanoChatModel: () => NanoChatModel,
24787
25743
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
25744
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
25745
+ NemotronHModel: () => NemotronHModel,
25746
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
24788
25747
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
24789
25748
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
24790
25749
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -24818,7 +25777,6 @@ __export(models_exports, {
24818
25777
  Owlv2Model: () => Owlv2Model,
24819
25778
  Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
24820
25779
  PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
24821
- PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
24822
25780
  ParakeetForCTC: () => ParakeetForCTC,
24823
25781
  ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
24824
25782
  PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
@@ -24848,8 +25806,10 @@ __export(models_exports, {
24848
25806
  Qwen2MoeModel: () => Qwen2MoeModel,
24849
25807
  Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
24850
25808
  Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
25809
+ Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
24851
25810
  Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
24852
25811
  Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
25812
+ Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
24853
25813
  Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
24854
25814
  Qwen3ForCausalLM: () => Qwen3ForCausalLM,
24855
25815
  Qwen3Model: () => Qwen3Model,
@@ -24860,9 +25820,13 @@ __export(models_exports, {
24860
25820
  Qwen3NextModel: () => Qwen3NextModel,
24861
25821
  Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
24862
25822
  Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
25823
+ Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
24863
25824
  Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
25825
+ Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
24864
25826
  Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
25827
+ Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
24865
25828
  Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
25829
+ Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
24866
25830
  Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
24867
25831
  RFDetrForObjectDetection: () => RFDetrForObjectDetection,
24868
25832
  RFDetrModel: () => RFDetrModel,
@@ -24913,11 +25877,13 @@ __export(models_exports, {
24913
25877
  SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
24914
25878
  SmolLM3Model: () => SmolLM3Model,
24915
25879
  SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
24916
- SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
24917
25880
  SnacDecoderModel: () => SnacDecoderModel,
24918
25881
  SnacEncoderModel: () => SnacEncoderModel,
24919
25882
  SnacModel: () => SnacModel,
24920
25883
  SnacPreTrainedModel: () => SnacPreTrainedModel,
25884
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
25885
+ SolarOpenModel: () => SolarOpenModel,
25886
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
24921
25887
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
24922
25888
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
24923
25889
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -24985,6 +25951,8 @@ __export(models_exports, {
24985
25951
  VitsModelOutput: () => VitsModelOutput,
24986
25952
  VitsPreTrainedModel: () => VitsPreTrainedModel,
24987
25953
  VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
25954
+ VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
25955
+ VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
24988
25956
  Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
24989
25957
  Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
24990
25958
  Wav2Vec2BertModel: () => Wav2Vec2BertModel,
@@ -25090,7 +26058,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
25090
26058
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
25091
26059
  };
25092
26060
 
25093
- // src/models/ast/modeling_ast.js
26061
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
25094
26062
  var ASTPreTrainedModel = class extends PreTrainedModel {
25095
26063
  };
25096
26064
  var ASTModel = class extends ASTPreTrainedModel {
@@ -25345,7 +26313,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
25345
26313
  if (!past_key_values || target_length !== 1) {
25346
26314
  throw new Error("Incorrect state encountered during generation.");
25347
26315
  }
25348
- const past_length = Object.values(past_key_values)[0].dims.at(-2);
26316
+ const past_length = past_key_values.get_seq_length();
25349
26317
  attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
25350
26318
  }
25351
26319
  }
@@ -25425,6 +26393,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
25425
26393
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
25426
26394
  };
25427
26395
 
26396
+ // src/models/chmv2/modeling_chmv2.js
26397
+ var CHMv2PreTrainedModel = class extends PreTrainedModel {
26398
+ };
26399
+ var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
26400
+ };
26401
+
25428
26402
  // src/models/clap/modeling_clap.js
25429
26403
  var ClapPreTrainedModel = class extends PreTrainedModel {
25430
26404
  };
@@ -25763,6 +26737,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
25763
26737
  }
25764
26738
  };
25765
26739
 
26740
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
26741
+ var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
26742
+ };
26743
+ var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
26744
+ };
26745
+ var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
26746
+ };
26747
+
25766
26748
  // src/models/deberta_v2/modeling_deberta_v2.js
25767
26749
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
25768
26750
  };
@@ -26111,6 +27093,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
26111
27093
  }
26112
27094
  };
26113
27095
 
27096
+ // src/models/eurobert/modeling_eurobert.js
27097
+ var EuroBertPreTrainedModel = class extends PreTrainedModel {
27098
+ };
27099
+ var EuroBertModel = class extends EuroBertPreTrainedModel {
27100
+ };
27101
+ var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
27102
+ /**
27103
+ * Calls the model on new inputs.
27104
+ *
27105
+ * @param {Object} model_inputs The inputs to the model.
27106
+ * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
27107
+ */
27108
+ async _call(model_inputs) {
27109
+ return new MaskedLMOutput(await super._call(model_inputs));
27110
+ }
27111
+ };
27112
+ var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
27113
+ /**
27114
+ * Calls the model on new inputs.
27115
+ *
27116
+ * @param {Object} model_inputs The inputs to the model.
27117
+ * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
27118
+ */
27119
+ async _call(model_inputs) {
27120
+ return new SequenceClassifierOutput(await super._call(model_inputs));
27121
+ }
27122
+ };
27123
+ var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
27124
+ /**
27125
+ * Calls the model on new inputs.
27126
+ *
27127
+ * @param {Object} model_inputs The inputs to the model.
27128
+ * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
27129
+ */
27130
+ async _call(model_inputs) {
27131
+ return new TokenClassifierOutput(await super._call(model_inputs));
27132
+ }
27133
+ };
27134
+
26114
27135
  // src/models/exaone/modeling_exaone.js
26115
27136
  var ExaonePreTrainedModel = class extends PreTrainedModel {
26116
27137
  };
@@ -26375,6 +27396,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
26375
27396
  });
26376
27397
  }
26377
27398
  };
27399
+ var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
27400
+ };
26378
27401
 
26379
27402
  // src/models/glm/modeling_glm.js
26380
27403
  var GlmPreTrainedModel = class extends PreTrainedModel {
@@ -26384,6 +27407,377 @@ var GlmModel = class extends GlmPreTrainedModel {
26384
27407
  var GlmForCausalLM = class extends GlmPreTrainedModel {
26385
27408
  };
26386
27409
 
27410
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
27411
+ var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
27412
+ };
27413
+ var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
27414
+ };
27415
+ var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
27416
+ };
27417
+
27418
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
27419
+ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
27420
+ forward_params = [
27421
+ // Text inputs
27422
+ "input_ids",
27423
+ "attention_mask",
27424
+ "position_ids",
27425
+ "past_key_values",
27426
+ // Vision inputs
27427
+ "pixel_values",
27428
+ "image_grid_thw"
27429
+ ];
27430
+ };
27431
+ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
27432
+ // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
27433
+ // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
27434
+ // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
27435
+ image_grid_thw_name = "grid_thw";
27436
+ /**
27437
+ * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
27438
+ * @param {Tensor} input_ids
27439
+ * @param {Tensor} attention_mask
27440
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
27441
+ */
27442
+ _get_text_only_rope_index(input_ids, attention_mask) {
27443
+ if (attention_mask) {
27444
+ const { data, dims } = cumsum_masked_fill(attention_mask);
27445
+ const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
27446
+ const mrope_position_deltas = Array.from(
27447
+ { length: dims[0] },
27448
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
27449
+ );
27450
+ return [
27451
+ new Tensor2("int64", position_ids, [3, ...dims]),
27452
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27453
+ ];
27454
+ } else {
27455
+ const [batch_size, seq_length] = input_ids.dims;
27456
+ const position_ids = BigInt64Array.from(
27457
+ { length: 3 * batch_size * seq_length },
27458
+ (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
27459
+ );
27460
+ return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
27461
+ }
27462
+ }
27463
+ /**
27464
+ * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
27465
+ * global [all_t, all_h, all_w] order, then write back into the position_ids array
27466
+ * respecting attention mask.
27467
+ * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
27468
+ * @param {number[]} attn_mask Attention mask for this batch element
27469
+ * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
27470
+ * @param {number} batch_idx Current batch index
27471
+ * @returns {number[]} Flat reordered positions of length total_len
27472
+ */
27473
+ _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
27474
+ const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
27475
+ const llm_positions = new Array(total_len);
27476
+ let index = 0;
27477
+ for (let x = 0; x < 3; ++x) {
27478
+ for (const val of llm_pos_ids_list) {
27479
+ const seg_len = val.length / 3;
27480
+ for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
27481
+ llm_positions[index++] = val[z];
27482
+ }
27483
+ }
27484
+ }
27485
+ let count2 = 0;
27486
+ for (let y = 0; y < attn_mask.length; ++y) {
27487
+ if (attn_mask[y] == 1) {
27488
+ for (let x = 0; x < 3; ++x) {
27489
+ position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
27490
+ }
27491
+ ++count2;
27492
+ }
27493
+ }
27494
+ return llm_positions;
27495
+ }
27496
+ /**
27497
+ * Build per-batch position ID segments for multimodal rope.
27498
+ * Override this in subclasses to change how vision/text segments are identified and positioned.
27499
+ * @param {object} params
27500
+ * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
27501
+ * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
27502
+ * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
27503
+ * @param {number} params.spatial_merge_size
27504
+ * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
27505
+ * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
27506
+ */
27507
+ _get_multimodal_rope_positions({
27508
+ filtered_ids,
27509
+ image_grid_thw_list,
27510
+ video_grid_thw_list,
27511
+ spatial_merge_size,
27512
+ state
27513
+ }) {
27514
+ const { image_token_id, video_token_id, vision_start_token_id } = this.config;
27515
+ const ids = filtered_ids;
27516
+ const vision_start_indices = ids.reduce((acc, x, idx) => {
27517
+ if (x == vision_start_token_id) acc.push(idx);
27518
+ return acc;
27519
+ }, []);
27520
+ const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
27521
+ const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
27522
+ const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
27523
+ const llm_pos_ids_list = [];
27524
+ let st2 = 0;
27525
+ let remain_images = image_nums;
27526
+ let remain_videos = video_nums;
27527
+ for (let j = 0; j < vision_tokens.length; ++j) {
27528
+ const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
27529
+ const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
27530
+ const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
27531
+ const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
27532
+ let ed;
27533
+ let t, h, w;
27534
+ if (ed_image < ed_video) {
27535
+ [t, h, w] = image_grid_thw_list[state.image_index];
27536
+ ++state.image_index;
27537
+ --remain_images;
27538
+ ed = ed_image;
27539
+ } else {
27540
+ [t, h, w] = video_grid_thw_list[state.video_index];
27541
+ ++state.video_index;
27542
+ --remain_videos;
27543
+ ed = ed_video;
27544
+ }
27545
+ const [llm_grid_t, llm_grid_h, llm_grid_w] = [
27546
+ Number(t),
27547
+ Math.floor(Number(h) / spatial_merge_size),
27548
+ Math.floor(Number(w) / spatial_merge_size)
27549
+ ];
27550
+ const text_len = ed - st2;
27551
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27552
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
27553
+ const offset = text_len + st_idx;
27554
+ const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
27555
+ const t_index = Array.from(
27556
+ { length: grid_size },
27557
+ (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
27558
+ );
27559
+ const h_index = Array.from(
27560
+ { length: grid_size },
27561
+ (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
27562
+ );
27563
+ const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
27564
+ llm_pos_ids_list.push([t_index, h_index, w_index].flat());
27565
+ st2 = ed + grid_size;
27566
+ }
27567
+ if (st2 < ids.length) {
27568
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27569
+ const text_len = ids.length - st2;
27570
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
27571
+ }
27572
+ return llm_pos_ids_list;
27573
+ }
27574
+ /**
27575
+ * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
27576
+ *
27577
+ * Explanation:
27578
+ * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
27579
+ *
27580
+ * For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
27581
+ * Examples:
27582
+ * input_ids: [T T T T T], here T is for text.
27583
+ * temporal position_ids: [0, 1, 2, 3, 4]
27584
+ * height position_ids: [0, 1, 2, 3, 4]
27585
+ * width position_ids: [0, 1, 2, 3, 4]
27586
+ *
27587
+ * For a vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part
27588
+ * and 1D rotary position embedding for the text part.
27589
+ * Examples:
27590
+ * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
27591
+ * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
27592
+ * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
27593
+ * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
27594
+ * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
27595
+ * text temporal position_ids: [3, 4, 5, 6, 7]
27596
+ * text height position_ids: [3, 4, 5, 6, 7]
27597
+ * text width position_ids: [3, 4, 5, 6, 7]
27598
+ * Here we calculate the text start position_ids as the max vision position_ids plus 1.
27599
+ *
27600
+ * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
27601
+ * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
27602
+ * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
27603
+ * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
27604
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
27605
+ */
27606
+ get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
27607
+ const { vision_config } = this.config;
27608
+ const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
27609
+ if (image_grid_thw || video_grid_thw) {
27610
+ const total_input_ids = input_ids.tolist();
27611
+ if (!attention_mask) {
27612
+ attention_mask = ones_like(input_ids);
27613
+ }
27614
+ const attention_mask_list = attention_mask.tolist();
27615
+ const position_ids_list = Array.from(
27616
+ { length: 3 },
27617
+ () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
27618
+ );
27619
+ const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
27620
+ const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
27621
+ const state = { image_index: 0, video_index: 0 };
27622
+ const mrope_position_deltas = [];
27623
+ for (let i = 0; i < total_input_ids.length; ++i) {
27624
+ const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
27625
+ const llm_pos_ids_list = this._get_multimodal_rope_positions({
27626
+ filtered_ids,
27627
+ image_grid_thw_list,
27628
+ video_grid_thw_list,
27629
+ spatial_merge_size,
27630
+ state
27631
+ });
27632
+ const llm_positions = this._reorder_and_write_positions(
27633
+ llm_pos_ids_list,
27634
+ attention_mask_list[i],
27635
+ position_ids_list,
27636
+ i
27637
+ );
27638
+ mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
27639
+ }
27640
+ return [
27641
+ new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
27642
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27643
+ ];
27644
+ } else {
27645
+ return this._get_text_only_rope_index(input_ids, attention_mask);
27646
+ }
27647
+ }
27648
+ async encode_image({ pixel_values, image_grid_thw }) {
27649
+ const features = (await sessionRun(this.sessions["vision_encoder"], {
27650
+ pixel_values,
27651
+ [this.image_grid_thw_name]: image_grid_thw
27652
+ })).image_features;
27653
+ return features;
27654
+ }
27655
+ _merge_input_ids_with_image_features(kwargs) {
27656
+ return default_merge_input_ids_with_image_features({
27657
+ // @ts-ignore
27658
+ image_token_id: this.config.image_token_id,
27659
+ ...kwargs
27660
+ });
27661
+ }
27662
+ prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
27663
+ if (model_inputs.attention_mask && !model_inputs.position_ids) {
27664
+ if (!model_inputs.past_key_values) {
27665
+ [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
27666
+ model_inputs.input_ids,
27667
+ model_inputs.image_grid_thw,
27668
+ model_inputs.video_grid_thw,
27669
+ model_inputs.attention_mask
27670
+ );
27671
+ } else {
27672
+ model_inputs.pixel_values = null;
27673
+ const past_length = model_inputs.past_key_values.get_seq_length();
27674
+ if (past_length < model_inputs.input_ids.dims[1]) {
27675
+ const [full_position_ids, rope_deltas] = this.get_rope_index(
27676
+ model_inputs.input_ids,
27677
+ model_inputs.image_grid_thw,
27678
+ model_inputs.video_grid_thw,
27679
+ model_inputs.attention_mask
27680
+ );
27681
+ model_inputs.rope_deltas = rope_deltas;
27682
+ model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
27683
+ model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
27684
+ } else {
27685
+ if (!model_inputs.rope_deltas) {
27686
+ [, model_inputs.rope_deltas] = this.get_rope_index(
27687
+ model_inputs.input_ids,
27688
+ model_inputs.image_grid_thw,
27689
+ model_inputs.video_grid_thw,
27690
+ model_inputs.attention_mask
27691
+ );
27692
+ }
27693
+ const delta = BigInt(past_length);
27694
+ const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
27695
+ model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27696
+ }
27697
+ }
27698
+ }
27699
+ return model_inputs;
27700
+ }
27701
+ };
27702
+ var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
27703
+ };
27704
+
27705
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
27706
+ var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
27707
+ image_grid_thw_name = "image_grid_thw";
27708
+ };
27709
+ var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
27710
+ image_grid_thw_name = "image_grid_thw";
27711
+ };
27712
+
27713
+ // src/models/glm_ocr/modeling_glm_ocr.js
27714
+ var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
27715
+ /**
27716
+ * Compute 3D positional indices for vision tokens.
27717
+ * Temporal is constant, height is repeat-interleaved, width tiles.
27718
+ * @param {number} start_position
27719
+ * @param {number[]} grid_thw [T, H, W]
27720
+ * @param {number} temp_merge_size
27721
+ * @param {number} spatial_merge_size
27722
+ * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
27723
+ */
27724
+ get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
27725
+ const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
27726
+ const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
27727
+ const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
27728
+ const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
27729
+ const t_pos = Array.from({ length: seq_len }, () => start_position);
27730
+ const h_pos = Array.from(
27731
+ { length: seq_len },
27732
+ (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
27733
+ );
27734
+ const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
27735
+ return [...t_pos, ...h_pos, ...w_pos];
27736
+ }
27737
+ /**
27738
+ * Build per-segment rope positions by splitting filtered_ids into runs of text tokens and image tokens (matched on config.image_token_id), instead of the vision_start_token_id scanning used by Qwen2VL.
27739
+ * A text run of length n yields 3 * n entries (the same n consecutive positions repeated once per axis); an image run consumes the next entry of image_grid_thw_list via state.image_index. video_grid_thw_list is accepted but not read here.
27740
+ * After an image run the position advances by floor(max(h, w) / spatial_merge_size); the floor matches get_vision_position_ids so every later position stays an integer.
27741
+ */
27742
+ _get_multimodal_rope_positions({
27743
+ filtered_ids,
27744
+ image_grid_thw_list,
27745
+ video_grid_thw_list,
27746
+ spatial_merge_size,
27747
+ state
27748
+ }) {
27749
+ const { image_token_id } = this.config;
27750
+ const groups = []; // entries are [type, start, end) with type 1 = image run, 0 = text run
27751
+ let group_start = 0;
27752
+ let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
27753
+ for (let j = 1; j <= filtered_ids.length; ++j) {
27754
+ const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1; // -1 past the end forces the last run to be flushed
27755
+ if (t !== current_type) {
27756
+ groups.push([current_type, group_start, j]);
27757
+ group_start = j;
27758
+ current_type = t;
27759
+ }
27760
+ }
27761
+ let current_pos = 0;
27762
+ const llm_pos_ids_list = [];
27763
+ for (const [modality_type, start_idx, end_idx] of groups) {
27764
+ if (modality_type === 0) {
27765
+ const text_len = end_idx - start_idx;
27766
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
27767
+ current_pos += text_len;
27768
+ } else {
27769
+ const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
27770
+ const temp_merge_size = grid_thw[0]; // dividing T by itself makes the merged temporal extent 1
27771
+ llm_pos_ids_list.push(
27772
+ this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
27773
+ );
27774
+ current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size); // integer advance; a fractional step would leak non-integer positions into the int64 tensor
27775
+ }
27776
+ }
27777
+ return llm_pos_ids_list;
27778
+ }
27779
+ };
27780
+
26387
27781
  // src/models/glpn/modeling_glpn.js
26388
27782
  var GLPNPreTrainedModel = class extends PreTrainedModel {
26389
27783
  };
@@ -26456,6 +27850,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
26456
27850
  var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
26457
27851
  };
26458
27852
 
27853
+ // src/models/ultravox/modeling_ultravox.js
27854
+ var UltravoxPreTrainedModel = class extends PreTrainedModel {
27855
+ forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
27856
+ };
27857
+ var UltravoxModel = class extends UltravoxPreTrainedModel {
27858
+ _merge_input_ids_with_audio_features(kwargs) {
27859
+ const audio_hidden_size = kwargs.audio_features.dims.at(-1);
27860
+ const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
27861
+ return default_merge_input_ids_with_audio_features({
27862
+ // @ts-ignore
27863
+ audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
27864
+ ...kwargs,
27865
+ audio_features: reshaped_audio_features
27866
+ });
27867
+ }
27868
+ };
27869
+
27870
+ // src/models/granite_speech/modeling_granite_speech.js
27871
+ var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
27872
+ forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
27873
+ };
27874
+
26459
27875
  // src/models/grounding_dino/modeling_grounding_dino.js
26460
27876
  var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
26461
27877
  };
@@ -26560,34 +27976,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
26560
27976
  var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
26561
27977
  };
26562
27978
 
26563
- // src/models/idefics3/modeling_idefics3.js
26564
- var Idefics3PreTrainedModel = class extends PreTrainedModel {
26565
- forward_params = [
26566
- "input_ids",
26567
- "attention_mask",
26568
- "pixel_values",
26569
- "pixel_attention_mask",
26570
- "position_ids",
26571
- "past_key_values"
26572
- ];
27979
+ // src/models/llava/modeling_llava.js
27980
+ var LlavaPreTrainedModel = class extends PreTrainedModel {
27981
+ forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
26573
27982
  };
26574
- var Idefics3ForConditionalGeneration = class extends Idefics3PreTrainedModel {
26575
- async encode_image({ pixel_values, pixel_attention_mask }) {
26576
- const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
26577
- return features;
26578
- }
27983
+ var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
26579
27984
  _merge_input_ids_with_image_features(kwargs) {
26580
27985
  const vision_hidden_size = kwargs.image_features.dims.at(-1);
26581
27986
  const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
26582
27987
  return default_merge_input_ids_with_image_features({
26583
27988
  // @ts-ignore
26584
- image_token_id: this.config.image_token_id,
27989
+ image_token_id: this.config.image_token_index ?? this.config.image_token_id,
26585
27990
  ...kwargs,
26586
27991
  image_features: reshaped_image_hidden_states
26587
27992
  });
26588
27993
  }
26589
27994
  };
26590
- var SmolVLMForConditionalGeneration = class extends Idefics3ForConditionalGeneration {
27995
+ var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
27996
+ };
27997
+ var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
27998
+ };
27999
+
28000
+ // src/models/idefics3/modeling_idefics3.js
28001
+ var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
28002
+ forward_params = [
28003
+ "input_ids",
28004
+ "attention_mask",
28005
+ "pixel_values",
28006
+ "pixel_attention_mask",
28007
+ "position_ids",
28008
+ "past_key_values"
28009
+ ];
26591
28010
  };
26592
28011
 
26593
28012
  // src/models/ijepa/modeling_ijepa.js
@@ -26671,6 +28090,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
26671
28090
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
26672
28091
  };
26673
28092
 
28093
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
28094
+ var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
28095
+ };
28096
+
26674
28097
  // src/models/lfm2_moe/modeling_lfm2_moe.js
26675
28098
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
26676
28099
  };
@@ -26679,6 +28102,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
26679
28102
  var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
26680
28103
  };
26681
28104
 
28105
+ // src/models/lfm2_vl/modeling_lfm2_vl.js
28106
+ var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
28107
+ forward_params = [
28108
+ "input_ids",
28109
+ "attention_mask",
28110
+ "pixel_values",
28111
+ "pixel_attention_mask",
28112
+ "spatial_shapes",
28113
+ "position_ids",
28114
+ "past_key_values"
28115
+ ];
28116
+ };
28117
+
26682
28118
  // src/models/llama/modeling_llama.js
26683
28119
  var LlamaPreTrainedModel = class extends PreTrainedModel {
26684
28120
  };
@@ -26693,27 +28129,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
26693
28129
  var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
26694
28130
  };
26695
28131
 
26696
- // src/models/llava/modeling_llava.js
26697
- var LlavaPreTrainedModel = class extends PreTrainedModel {
26698
- forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
26699
- };
26700
- var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
26701
- _merge_input_ids_with_image_features(kwargs) {
26702
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
26703
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
26704
- return default_merge_input_ids_with_image_features({
26705
- // @ts-ignore
26706
- image_token_id: this.config.image_token_index,
26707
- ...kwargs,
26708
- image_features: reshaped_image_hidden_states
26709
- });
26710
- }
26711
- };
26712
- var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
26713
- };
26714
- var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
26715
- };
26716
-
26717
28132
  // src/models/longt5/modeling_longt5.js
26718
28133
  var LongT5PreTrainedModel = class extends PreTrainedModel {
26719
28134
  };
@@ -26875,6 +28290,14 @@ var MistralModel = class extends MistralPreTrainedModel {
26875
28290
  var MistralForCausalLM = class extends MistralPreTrainedModel {
26876
28291
  };
26877
28292
 
28293
+ // src/models/mistral4/modeling_mistral4.js
28294
+ var Mistral4PreTrainedModel = class extends PreTrainedModel {
28295
+ };
28296
+ var Mistral4Model = class extends Mistral4PreTrainedModel {
28297
+ };
28298
+ var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
28299
+ };
28300
+
26878
28301
  // src/models/mobilebert/modeling_mobilebert.js
26879
28302
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
26880
28303
  };
@@ -27343,6 +28766,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
27343
28766
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
27344
28767
  };
27345
28768
 
28769
+ // src/models/nemotron_h/modeling_nemotron_h.js
28770
+ var NemotronHPreTrainedModel = class extends PreTrainedModel {
28771
+ };
28772
+ var NemotronHModel = class extends NemotronHPreTrainedModel {
28773
+ };
28774
+ var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
28775
+ };
28776
+
27346
28777
  // src/models/neobert/modeling_neobert.js
27347
28778
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
27348
28779
  };
@@ -27464,27 +28895,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
27464
28895
  };
27465
28896
 
27466
28897
  // src/models/paligemma/modeling_paligemma.js
27467
- var PaliGemmaPreTrainedModel = class extends PreTrainedModel {
27468
- forward_params = [
27469
- "input_ids",
27470
- // 'inputs_embeds',
27471
- "attention_mask",
27472
- "pixel_values",
27473
- "position_ids",
27474
- "past_key_values"
27475
- ];
27476
- };
27477
- var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
27478
- _merge_input_ids_with_image_features(kwargs) {
27479
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
27480
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
27481
- return default_merge_input_ids_with_image_features({
27482
- // @ts-ignore
27483
- image_token_id: this.config.image_token_index,
27484
- ...kwargs,
27485
- image_features: reshaped_image_hidden_states
27486
- });
27487
- }
28898
+ var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
27488
28899
  };
27489
28900
 
27490
28901
  // src/models/parakeet/modeling_parakeet.js
@@ -27643,244 +29054,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
27643
29054
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
27644
29055
  };
27645
29056
 
27646
- // src/models/qwen2_vl/modeling_qwen2_vl.js
27647
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
27648
- forward_params = [
27649
- // Text inputs
27650
- "input_ids",
27651
- "attention_mask",
27652
- "position_ids",
27653
- "past_key_values",
27654
- // Vision inputs
27655
- "pixel_values",
27656
- "image_grid_thw"
27657
- ];
27658
- };
27659
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
27660
- image_grid_thw_name = "grid_thw";
27661
- /**
27662
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
27663
- *
27664
- * Explanation:
27665
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
27666
- *
27667
- * For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
27668
- * Examples:
27669
- * input_ids: [T T T T T], here T is for text.
27670
- * temporal position_ids: [0, 1, 2, 3, 4]
27671
- * height position_ids: [0, 1, 2, 3, 4]
27672
- * width position_ids: [0, 1, 2, 3, 4]
27673
- *
27674
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
27675
- * and 1D rotary position embeddin for text part.
27676
- * Examples:
27677
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
27678
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
27679
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
27680
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
27681
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
27682
- * text temporal position_ids: [3, 4, 5, 6, 7]
27683
- * text height position_ids: [3, 4, 5, 6, 7]
27684
- * text width position_ids: [3, 4, 5, 6, 7]
27685
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
27686
- *
27687
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
27688
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
27689
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
27690
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
27691
- * - 1 for tokens that are **not masked**,
27692
- * - 0 for tokens that are **masked**.
27693
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
27694
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
27695
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
27696
- */
27697
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
27698
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
27699
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
27700
- const mrope_position_deltas = [];
27701
- if (image_grid_thw || video_grid_thw) {
27702
- let total_input_ids = input_ids.tolist();
27703
- if (!attention_mask) {
27704
- attention_mask = ones_like(input_ids);
27705
- }
27706
- const attention_mask_list = attention_mask.tolist();
27707
- const position_ids_list = Array.from(
27708
- { length: 3 },
27709
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
27710
- );
27711
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
27712
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
27713
- let image_index = 0;
27714
- let video_index = 0;
27715
- for (let i = 0; i < total_input_ids.length; ++i) {
27716
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
27717
- const vision_start_indices = ids.reduce((acc, x, idx) => {
27718
- if (x == vision_start_token_id) acc.push(idx);
27719
- return acc;
27720
- }, []);
27721
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
27722
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
27723
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
27724
- let llm_pos_ids_list = [];
27725
- let st2 = 0;
27726
- let remain_images = image_nums;
27727
- let remain_videos = video_nums;
27728
- for (let j = 0; j < vision_tokens.length; ++j) {
27729
- const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
27730
- const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
27731
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
27732
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
27733
- let ed;
27734
- let t, h, w;
27735
- if (ed_image < ed_video) {
27736
- [t, h, w] = image_grid_thw_list[image_index];
27737
- ++image_index;
27738
- --remain_images;
27739
- ed = ed_image;
27740
- } else {
27741
- [t, h, w] = video_grid_thw_list[video_index];
27742
- ++video_index;
27743
- --remain_videos;
27744
- ed = ed_video;
27745
- }
27746
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
27747
- Number(t),
27748
- Math.floor(Number(h) / spatial_merge_size),
27749
- Math.floor(Number(w) / spatial_merge_size)
27750
- ];
27751
- const text_len = ed - st2;
27752
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27753
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
27754
- const offset = text_len + st_idx;
27755
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
27756
- const t_index = Array.from(
27757
- { length: grid_size },
27758
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
27759
- );
27760
- const h_index = Array.from(
27761
- { length: grid_size },
27762
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
27763
- );
27764
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
27765
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
27766
- st2 = ed + grid_size;
27767
- }
27768
- if (st2 < ids.length) {
27769
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27770
- const text_len = ids.length - st2;
27771
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
27772
- }
27773
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
27774
- const llm_positions = new Array(num_items);
27775
- let index = 0;
27776
- for (let x = 0; x < 3; ++x) {
27777
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
27778
- const val = llm_pos_ids_list[y];
27779
- const text_len = val.length / 3;
27780
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
27781
- llm_positions[index++] = val[z];
27782
- }
27783
- }
27784
- }
27785
- let count2 = 0;
27786
- const attn_mask = attention_mask_list[i];
27787
- for (let y = 0; y < attn_mask.length; ++y) {
27788
- if (attn_mask[y] == 1) {
27789
- for (let x = 0; x < 3; ++x) {
27790
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
27791
- }
27792
- ++count2;
27793
- }
27794
- }
27795
- const max_llm_positions = max(llm_positions)[0];
27796
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
27797
- }
27798
- return [
27799
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
27800
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27801
- ];
27802
- } else {
27803
- if (attention_mask) {
27804
- const { data, dims } = cumsum_masked_fill(attention_mask);
27805
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
27806
- const mrope_position_deltas2 = Array.from(
27807
- { length: dims[0] },
27808
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
27809
- );
27810
- return [
27811
- new Tensor2("int64", position_ids, [3, ...dims]),
27812
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
27813
- ];
27814
- } else {
27815
- const [batch_size, seq_length] = input_ids.dims;
27816
- const position_ids = BigInt64Array.from(
27817
- { length: 3 * batch_size * seq_length },
27818
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
27819
- );
27820
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
27821
- }
27822
- }
27823
- }
27824
- async encode_image({ pixel_values, image_grid_thw }) {
27825
- const features = (await sessionRun(this.sessions["vision_encoder"], {
27826
- pixel_values,
27827
- [this.image_grid_thw_name]: image_grid_thw
27828
- })).image_features;
27829
- return features;
27830
- }
27831
- _merge_input_ids_with_image_features(kwargs) {
27832
- return default_merge_input_ids_with_image_features({
27833
- // @ts-ignore
27834
- image_token_id: this.config.image_token_id,
27835
- ...kwargs
27836
- });
27837
- }
27838
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
27839
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
27840
- if (!model_inputs.past_key_values) {
27841
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
27842
- model_inputs.input_ids,
27843
- model_inputs.image_grid_thw,
27844
- model_inputs.video_grid_thw,
27845
- model_inputs.attention_mask
27846
- );
27847
- } else {
27848
- model_inputs.pixel_values = null;
27849
- const past_length = getPastLength(model_inputs.past_key_values);
27850
- if (past_length < model_inputs.input_ids.dims[1]) {
27851
- const [full_position_ids, rope_deltas] = this.get_rope_index(
27852
- model_inputs.input_ids,
27853
- model_inputs.image_grid_thw,
27854
- model_inputs.video_grid_thw,
27855
- model_inputs.attention_mask
27856
- );
27857
- model_inputs.rope_deltas = rope_deltas;
27858
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
27859
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
27860
- } else {
27861
- if (!model_inputs.rope_deltas) {
27862
- [, model_inputs.rope_deltas] = this.get_rope_index(
27863
- model_inputs.input_ids,
27864
- model_inputs.image_grid_thw,
27865
- model_inputs.video_grid_thw,
27866
- model_inputs.attention_mask
27867
- );
27868
- }
27869
- const delta = BigInt(past_length);
27870
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
27871
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27872
- }
27873
- }
27874
- }
27875
- return model_inputs;
27876
- }
27877
- };
27878
-
27879
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
27880
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
27881
- image_grid_thw_name = "image_grid_thw";
27882
- };
27883
-
27884
29057
  // src/models/qwen3/modeling_qwen3.js
27885
29058
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
27886
29059
  };
@@ -27908,18 +29081,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
27908
29081
  // src/models/qwen3_vl/modeling_qwen3_vl.js
27909
29082
  var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
27910
29083
  };
29084
+ var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
29085
+ };
27911
29086
 
27912
29087
  // src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
27913
29088
  var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
27914
29089
  };
29090
+ var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
29091
+ };
27915
29092
 
27916
29093
  // src/models/qwen3_5/modeling_qwen3_5.js
27917
29094
  var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
27918
29095
  };
29096
+ var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
29097
+ };
27919
29098
 
27920
29099
  // src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
27921
29100
  var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
27922
29101
  };
29102
+ var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
29103
+ };
27923
29104
 
27924
29105
  // src/models/resnet/modeling_resnet.js
27925
29106
  var ResNetPreTrainedModel = class extends PreTrainedModel {
@@ -28318,6 +29499,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
28318
29499
  }
28319
29500
  };
28320
29501
 
29502
+ // src/models/solar_open/modeling_solar_open.js
29503
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
29504
+ };
29505
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
29506
+ };
29507
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
29508
+ };
29509
+
28321
29510
  // src/models/speecht5/modeling_speecht5.js
28322
29511
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
28323
29512
  };
@@ -28600,25 +29789,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
28600
29789
  var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
28601
29790
  };
28602
29791
 
28603
- // src/models/ultravox/modeling_ultravox.js
28604
- var UltravoxPreTrainedModel = class extends PreTrainedModel {
28605
- forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
28606
- };
28607
- var UltravoxModel = class extends UltravoxPreTrainedModel {
28608
- _merge_input_ids_with_audio_features(kwargs) {
28609
- const audio_hidden_size = kwargs.audio_features.dims.at(-1);
28610
- const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
28611
- return default_merge_input_ids_with_audio_features({
28612
- // @ts-ignore
28613
- audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
28614
- ...kwargs,
28615
- audio_features: reshaped_audio_features
28616
- });
28617
- }
28618
- };
28619
- var VoxtralForConditionalGeneration = class extends UltravoxModel {
28620
- };
28621
-
28622
29792
  // src/models/unispeech/modeling_unispeech.js
28623
29793
  var UniSpeechPreTrainedModel = class extends PreTrainedModel {
28624
29794
  };
@@ -28784,6 +29954,170 @@ var VitsModel = class extends VitsPreTrainedModel {
28784
29954
  }
28785
29955
  };
28786
29956
 
29957
+ // src/models/voxtral/modeling_voxtral.js
29958
+ var VoxtralForConditionalGeneration = class extends UltravoxModel { // Adds nothing to UltravoxModel; registered for "voxtral" in MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES.
29959
+ };
29960
+
29961
+ // src/models/voxtral_realtime/modeling_voxtral_realtime.js
29962
+ var CONV1_LEFT_PAD = 2; // Size of the last axis of the encoder padding cache allocated by createEncoderState.
29963
+ var CONV2_LEFT_PAD = 1; // Left-padding term used by encodeChunk when computing how many encoder positions a chunk yields.
29964
+ var states = /* @__PURE__ */ new WeakMap(); // Model instance -> encoder state; set by generate() for the duration of a run and read by forward().
29965
/**
 * Build the mutable per-generation state used to stream audio through the encoder.
 *
 * Allocates one tensor per entry returned by `getCacheShapes(audio_config, { batch_size: 1 })`,
 * allocates the convolution padding cache, and obtains an iterator over the feature chunks.
 *
 * @param {Object} model Model exposing `config` (with `text_config` and `audio_config`) and `sessions`.
 * @param {Object} input_features Iterable or async iterable of feature chunks; the async protocol
 * is preferred when the object implements both.
 * @returns {Object} State object read and updated by `encodeChunk`, `fillAudioBuffer`
 * and `addAudioEmbeddings`.
 * @throws {Error} If `input_features` implements neither iteration protocol.
 */
function createEncoderState(model, input_features) {
  /** @type {any} */
  const model_config = model.config;
  const { text_config, audio_config } = model_config;
  const encoder_session = model.sessions["audio_encoder"];
  const padding_channels = audio_config.num_mel_bins + audio_config.hidden_size;

  const enc_kv_cache = new DynamicCache();
  // Cache dtype follows the encoder session config and falls back to float32.
  const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
  const BufferType = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
  const allocate = (dims) => {
    const element_count = dims.reduce((acc, dim) => acc * dim, 1);
    return new Tensor2(enc_dtype, new BufferType(element_count), dims);
  };

  const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
  for (const name in enc_shapes) {
    enc_kv_cache[name] = allocate(enc_shapes[name]);
  }
  const enc_padding_cache = allocate([1, padding_channels, CONV1_LEFT_PAD]);

  const async_iter = input_features[Symbol.asyncIterator]?.();
  const chunks_iter = async_iter ?? input_features[Symbol.iterator]?.();
  if (!chunks_iter) {
    throw new Error("input_features must be iterable or async iterable");
  }

  return {
    encoder_session,
    enc_kv_cache,
    enc_padding_cache,
    enc_past_seq_len: 0,
    audio_embed_queue: [],
    audio_embed_total_tokens: 0,
    audio_queue_offset: 0,
    audio_consumed: 0,
    stream_exhausted: false,
    chunks_iter,
    text_hidden_size: text_config.hidden_size
  };
}
30004
/**
 * Run one feature chunk through the encoder session and roll the cached state forward.
 *
 * Position ids continue from `s.enc_past_seq_len`, and the attention mask covers all
 * positions seen so far plus the new ones. After the run, the padding cache and every
 * `present.*` output replace their previous counterparts on `s`; a previous tensor is
 * disposed only when its `location` is "gpu-buffer".
 *
 * @param {Object} s Encoder state created by `createEncoderState`; mutated in place.
 * @param {Object} chunk_features Feature tensor; `dims[2]` is read as the chunk length.
 * @returns {Promise<Object>} The `audio_embeds` output of the encoder session.
 */
async function encodeChunk(s, chunk_features) {
  const chunk_len = chunk_features.dims[2];
  const new_positions = Math.floor((CONV2_LEFT_PAD + chunk_len - 3) / 2) + 1;
  const first_position = s.enc_past_seq_len;

  const position_ids = new Tensor2(
    "int64",
    BigInt64Array.from({ length: new_positions }, (_, offset) => BigInt(first_position + offset)),
    [1, new_positions]
  );
  const total_seq_len = first_position + new_positions;

  const feeds = {
    input_features: chunk_features,
    attention_mask: ones([1, total_seq_len]),
    position_ids,
    past_padding_cache: s.enc_padding_cache,
    ...s.enc_kv_cache
  };
  const outputs = await sessionRun(s.encoder_session, feeds);
  const { audio_embeds, present_padding_cache, ...present_cache } = outputs;

  // Swap in the new padding cache, releasing the old one if it lives on the GPU.
  if (s.enc_padding_cache.location === "gpu-buffer") {
    s.enc_padding_cache.dispose();
  }
  s.enc_padding_cache = present_padding_cache;

  // Each `present.*` output becomes the matching `past_key_values.*` input of the next run.
  for (const name in present_cache) {
    if (!name.startsWith("present.")) {
      continue;
    }
    const past_name = name.replace("present", "past_key_values");
    const previous = s.enc_kv_cache[past_name];
    if (previous?.location === "gpu-buffer") {
      previous.dispose();
    }
    s.enc_kv_cache[past_name] = present_cache[name];
  }

  s.enc_past_seq_len = total_seq_len;
  return audio_embeds;
}
30038
/**
 * Pull and encode chunks until at least `needed` audio tokens have been produced in
 * total, or the chunk iterator reports completion.
 *
 * Each encoded chunk is appended to `s.audio_embed_queue` as `{ data, tokens }` and
 * counted in `s.audio_embed_total_tokens`. When the iterator is done,
 * `s.stream_exhausted` is set and no further chunks are requested.
 *
 * @param {Object} s Encoder state; mutated in place.
 * @param {number} needed Cumulative token count to reach.
 * @returns {Promise<void>}
 */
async function fillAudioBuffer(s, needed) {
  const needs_more = () => s.audio_embed_total_tokens < needed && !s.stream_exhausted;
  while (needs_more()) {
    const step = await s.chunks_iter.next();
    if (step.done) {
      s.stream_exhausted = true;
      return;
    }
    const embeds = await encodeChunk(s, step.value);
    const tokens = embeds.dims[1];
    s.audio_embed_queue.push({ data: embeds.data, tokens });
    s.audio_embed_total_tokens += tokens;
  }
}
30050
/**
 * Add queued audio embeddings, element-wise and in place, onto the first positions of
 * `inputs_embeds.data`.
 *
 * Consumes up to `current_len` tokens from the front of `s.audio_embed_queue`, resuming
 * inside the front entry at `s.audio_queue_offset`. Entries that are fully consumed are
 * removed from the queue. `s.audio_consumed` grows by the number of tokens actually used.
 * Does nothing when the queue is empty.
 *
 * @param {Object} s Encoder state; mutated in place.
 * @param {Object} inputs_embeds Object whose `data` holds `text_hidden_size` values per token.
 * @param {number} current_len Number of token positions to fill.
 * @returns {void}
 */
function addAudioEmbeddings(s, inputs_embeds, current_len) {
  const queue = s.audio_embed_queue;
  if (queue.length === 0) return;

  const target = inputs_embeds.data;
  let write_token = 0;
  let remaining = current_len;

  while (remaining > 0 && queue.length > 0) {
    const head = queue[0];
    const take = Math.min(remaining, head.tokens - s.audio_queue_offset);
    const value_count = take * s.text_hidden_size;
    const dst_base = write_token * s.text_hidden_size;
    const src_base = s.audio_queue_offset * s.text_hidden_size;
    for (let k = 0; k < value_count; ++k) {
      target[dst_base + k] += head.data[src_base + k];
    }

    write_token += take;
    remaining -= take;
    s.audio_queue_offset += take;

    // Drop the front entry once every one of its tokens has been used.
    if (s.audio_queue_offset >= head.tokens) {
      queue.shift();
      s.audio_queue_offset = 0;
    }
  }

  s.audio_consumed += current_len - remaining;
}
30073
/**
 * Stopping criterion that fires once the audio stream has been fully read
 * (`stream_exhausted`) and every queued audio embedding has been consumed.
 */
var AudioExhaustedCriteria = class extends StoppingCriteria {
  /**
   * @param {Object} enc_state Encoder state shared with `forward`; it is read each
   * time the criterion is evaluated, not copied.
   */
  constructor(enc_state) {
    super();
    this._s = enc_state;
  }

  /**
   * @param {Array} input_ids One entry per sequence; only its length is used.
   * @returns {Array} The same verdict repeated for every entry of `input_ids`.
   */
  _call(input_ids) {
    const state = this._s;
    const finished = state.stream_exhausted && state.audio_embed_queue.length === 0;
    return input_ids.map(() => finished);
  }
};
30083
+ var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel { // Base class for the realtime Voxtral model.
30084
+ forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"]; // No audio input is listed: audio is supplied through generate({ input_features }) on the subclass. NOTE(review): presumably consumed by PreTrainedModel to select forward inputs; confirm.
30085
+ };
30086
/**
 * Realtime Voxtral model: audio feature chunks are pulled lazily from an iterable while
 * text is generated, and the encoded audio is added onto the token embeddings.
 */
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
  /**
   * Embed `input_ids`, add any pending audio embeddings, and run the merged decoder.
   *
   * When an encoder state is registered for this model (i.e. during `generate`), enough
   * audio is encoded first to cover the positions in `input_ids`.
   *
   * @param {Object} inputs
   * @param {Object} inputs.input_ids Token ids; `dims[1]` is the number of new positions.
   * @param {Object} [inputs.past_key_values] Decoder cache passed to `addPastKeyValues`.
   * @returns {Promise<Object>} Outputs of the `decoder_model_merged` session.
   */
  async forward({ input_ids, past_key_values, ...kwargs }) {
    const current_len = input_ids.dims[1];
    const enc = states.get(this);
    if (enc) {
      await fillAudioBuffer(enc, enc.audio_consumed + current_len);
    }
    const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
    if (enc) {
      addAudioEmbeddings(enc, inputs_embeds, current_len);
    }
    const decoder_feeds = { inputs_embeds, ...kwargs };
    this.addPastKeyValues(decoder_feeds, past_key_values);
    const session = this.sessions["decoder_model_merged"];
    // Only pass the inputs the decoder session actually declares.
    const fixed = pick(decoder_feeds, session.inputNames);
    return await sessionRun(session, fixed);
  }

  /**
   * Generate while streaming audio from `input_features`.
   *
   * An `AudioExhaustedCriteria` is always installed ahead of any user-supplied stopping
   * criteria, so generation ends once the audio stream is drained.
   *
   * @param {Object} options
   * @param {Object} options.input_features Iterable or async iterable of feature chunks.
   * @param {Object} [options.stopping_criteria] Extra criteria appended after the built-in one.
   * @returns {Promise<Object>} Whatever the parent `generate` returns.
   * @throws {Error} If `input_features` is missing or not iterable.
   */
  async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
    if (!input_features) {
      throw new Error("input_features (generator/iterable) must be provided");
    }
    const enc_state = createEncoderState(this, input_features);
    states.set(this, enc_state);
    const stopping_criteria = new StoppingCriteriaList();
    stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
    if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
    try {
      return await super.generate({ ...kwargs, stopping_criteria });
    } finally {
      // Unregister first so a failing disposal can never leave stale encoder state
      // attached to this model for later forward() calls.
      states.delete(this);
      try {
        // encodeChunk only releases the *previous* padding cache when it swaps in a new
        // one, so the last one is still alive here and must be released as well.
        const padding_cache = enc_state.enc_padding_cache;
        if (padding_cache?.location === "gpu-buffer") {
          padding_cache.dispose();
        }
      } finally {
        // Awaited so that disposal completes (and any failure surfaces) before returning;
        // awaiting is harmless if dispose() turns out to be synchronous.
        await enc_state.enc_kv_cache.dispose();
      }
    }
  }
};
30120
+
28787
30121
  // src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
28788
30122
  var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
28789
30123
  };
@@ -29289,6 +30623,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
29289
30623
  // src/models/registry.js
29290
30624
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
29291
30625
  ["bert", "BertModel"],
30626
+ ["eurobert", "EuroBertModel"],
29292
30627
  ["neobert", "NeoBertModel"],
29293
30628
  ["modernbert", "ModernBertModel"],
29294
30629
  ["nomic_bert", "NomicBertModel"],
@@ -29420,6 +30755,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
29420
30755
  ["gemma3_text", "Gemma3Model"],
29421
30756
  ["helium", "HeliumModel"],
29422
30757
  ["glm", "GlmModel"],
30758
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
29423
30759
  ["openelm", "OpenELMModel"],
29424
30760
  ["qwen2", "Qwen2Model"],
29425
30761
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -29431,12 +30767,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
29431
30767
  ["mpt", "MptModel"],
29432
30768
  ["opt", "OPTModel"],
29433
30769
  ["mistral", "MistralModel"],
30770
+ ["mistral4", "Mistral4Model"],
29434
30771
  ["ministral", "MinistralModel"],
29435
30772
  ["ministral3", "Ministral3Model"],
29436
30773
  ["ernie4_5", "Ernie4_5ForCausalLM"],
29437
30774
  ["starcoder2", "Starcoder2Model"],
30775
+ ["deepseek_v3", "DeepseekV3Model"],
29438
30776
  ["falcon", "FalconModel"],
29439
30777
  ["falcon_h1", "FalconH1Model"],
30778
+ ["nemotron_h", "NemotronHModel"],
30779
+ ["solar_open", "SolarOpenModel"],
29440
30780
  ["stablelm", "StableLmModel"],
29441
30781
  ["modernbert-decoder", "ModernBertDecoderModel"],
29442
30782
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -29456,6 +30796,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29456
30796
  ]);
29457
30797
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29458
30798
  ["bert", "BertForSequenceClassification"],
30799
+ ["eurobert", "EuroBertForSequenceClassification"],
29459
30800
  ["neobert", "NeoBertForSequenceClassification"],
29460
30801
  ["modernbert", "ModernBertForSequenceClassification"],
29461
30802
  ["roformer", "RoFormerForSequenceClassification"],
@@ -29478,6 +30819,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29478
30819
  ]);
29479
30820
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29480
30821
  ["bert", "BertForTokenClassification"],
30822
+ ["eurobert", "EuroBertForTokenClassification"],
29481
30823
  ["neobert", "NeoBertForTokenClassification"],
29482
30824
  ["modernbert", "ModernBertForTokenClassification"],
29483
30825
  ["roformer", "RoFormerForTokenClassification"],
@@ -29537,27 +30879,40 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29537
30879
  ["gemma2", "Gemma2ForCausalLM"],
29538
30880
  ["vaultgemma", "VaultGemmaForCausalLM"],
29539
30881
  ["gemma3_text", "Gemma3ForCausalLM"],
30882
+ ["gemma3", "Gemma3ForCausalLM"],
29540
30883
  ["helium", "HeliumForCausalLM"],
29541
30884
  ["glm", "GlmForCausalLM"],
30885
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
29542
30886
  ["openelm", "OpenELMForCausalLM"],
29543
30887
  ["qwen2", "Qwen2ForCausalLM"],
29544
30888
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
29545
30889
  ["qwen3", "Qwen3ForCausalLM"],
29546
30890
  ["qwen3_moe", "Qwen3MoeForCausalLM"],
29547
30891
  ["qwen3_next", "Qwen3NextForCausalLM"],
30892
+ ["qwen2_vl", "Qwen2VLForCausalLM"],
30893
+ ["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
30894
+ ["qwen3_vl", "Qwen3VLForCausalLM"],
30895
+ ["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
30896
+ ["qwen3_5", "Qwen3_5ForCausalLM"],
30897
+ ["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
30898
+ ["gemma3n", "Gemma3nForCausalLM"],
29548
30899
  ["phi", "PhiForCausalLM"],
29549
30900
  ["phi3", "Phi3ForCausalLM"],
29550
30901
  ["mpt", "MptForCausalLM"],
29551
30902
  ["opt", "OPTForCausalLM"],
29552
30903
  ["mbart", "MBartForCausalLM"],
29553
30904
  ["mistral", "MistralForCausalLM"],
30905
+ ["mistral4", "Mistral4ForCausalLM"],
29554
30906
  ["ministral", "MinistralForCausalLM"],
29555
30907
  ["ministral3", "Ministral3ForCausalLM"],
29556
30908
  ["ernie4_5", "Ernie4_5ForCausalLM"],
29557
30909
  ["starcoder2", "Starcoder2ForCausalLM"],
30910
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
29558
30911
  ["falcon", "FalconForCausalLM"],
29559
30912
  ["falcon_h1", "FalconH1ForCausalLM"],
30913
+ ["nemotron_h", "NemotronHForCausalLM"],
29560
30914
  ["trocr", "TrOCRForCausalLM"],
30915
+ ["solar_open", "SolarOpenForCausalLM"],
29561
30916
  ["stablelm", "StableLmForCausalLM"],
29562
30917
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
29563
30918
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -29568,6 +30923,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29568
30923
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
29569
30924
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29570
30925
  ["bert", "BertForMaskedLM"],
30926
+ ["eurobert", "EuroBertForMaskedLM"],
29571
30927
  ["neobert", "NeoBertForMaskedLM"],
29572
30928
  ["modernbert", "ModernBertForMaskedLM"],
29573
30929
  ["roformer", "RoFormerForMaskedLM"],
@@ -29620,16 +30976,21 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
29620
30976
  ["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
29621
30977
  ["qwen3_5", "Qwen3_5ForConditionalGeneration"],
29622
30978
  ["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
30979
+ ["lfm2_vl", "Lfm2VlForConditionalGeneration"],
29623
30980
  ["idefics3", "Idefics3ForConditionalGeneration"],
29624
30981
  ["smolvlm", "SmolVLMForConditionalGeneration"],
29625
30982
  ["paligemma", "PaliGemmaForConditionalGeneration"],
29626
30983
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
29627
30984
  ["gemma3n", "Gemma3nForConditionalGeneration"],
29628
- ["mistral3", "Mistral3ForConditionalGeneration"]
30985
+ ["mistral3", "Mistral3ForConditionalGeneration"],
30986
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
30987
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
29629
30988
  ]);
29630
30989
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30990
+ ["granite_speech", "GraniteSpeechForConditionalGeneration"],
29631
30991
  ["ultravox", "UltravoxModel"],
29632
- ["voxtral", "VoxtralForConditionalGeneration"]
30992
+ ["voxtral", "VoxtralForConditionalGeneration"],
30993
+ ["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
29633
30994
  ]);
29634
30995
  var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
29635
30996
  ["vision-encoder-decoder", "VisionEncoderDecoderModel"]
@@ -29728,6 +31089,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29728
31089
  ]);
29729
31090
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
29730
31091
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
31092
+ ["chmv2", "CHMv2ForDepthEstimation"],
29731
31093
  ["dpt", "DPTForDepthEstimation"],
29732
31094
  ["depth_anything", "DepthAnythingForDepthEstimation"],
29733
31095
  ["glpn", "GLPNForDepthEstimation"],
@@ -29812,7 +31174,19 @@ var CUSTOM_MAPPING = [
29812
31174
  MODEL_TYPES.ImageAudioTextToText
29813
31175
  ],
29814
31176
  ["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
29815
- ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
31177
+ ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
31178
+ ["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
31179
+ ["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
31180
+ ["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
31181
+ ["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
31182
+ ["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
31183
+ ["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
31184
+ ["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
31185
+ [
31186
+ "VoxtralRealtimeForConditionalGeneration",
31187
+ VoxtralRealtimeForConditionalGeneration,
31188
+ MODEL_TYPES.VoxtralRealtime
31189
+ ]
29816
31190
  ];
29817
31191
  for (const [name, model, type] of CUSTOM_MAPPING) {
29818
31192
  MODEL_TYPE_MAPPING.set(name, type);
@@ -31490,8 +32864,18 @@ var TASK_ALIASES = Object.freeze({
31490
32864
  });
31491
32865
 
31492
32866
  // src/utils/model_registry/get_model_files.js
32867
/**
 * Load a model config through `AutoConfig.from_pretrained`, sharing one promise per
 * distinct request when no explicit config is supplied.
 *
 * When `config` is given, the call goes straight through and is never memoized.
 * Otherwise the memoization key is the JSON encoding of
 * `[modelId, cache_dir, local_files_only, revision]`.
 *
 * @param {string} modelId Model identifier.
 * @param {Object} [options]
 * @param {Object|null} [options.config=null] Explicit config; bypasses memoization.
 * @param {string|null} [options.cache_dir=null]
 * @param {boolean} [options.local_files_only=false]
 * @param {string} [options.revision="main"]
 * @returns {Promise<Object>} Result of `AutoConfig.from_pretrained`.
 */
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
  const load = () => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
  if (config !== null) {
    return load();
  }
  const memo_key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
  return memoizePromise(memo_key, load);
}
31493
32877
  async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
31494
- config = await AutoConfig.from_pretrained(modelId, { config });
32878
+ config = await get_config(modelId, { config });
31495
32879
  const files = [
31496
32880
  // Add config.json (always loaded)
31497
32881
  "config.json"
@@ -31552,74 +32936,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
31552
32936
  files.push(dataFilePath);
31553
32937
  }
31554
32938
  };
31555
- const singleModelName = model_file_name ?? "model";
31556
- if (modelType === MODEL_TYPES.DecoderOnly) {
31557
- add_model_file("model", singleModelName);
31558
- files.push("generation_config.json");
31559
- } else if (modelType === MODEL_TYPES.DecoderOnlyWithoutHead) {
31560
- add_model_file("model", singleModelName);
31561
- } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
31562
- add_model_file("model", "encoder_model");
31563
- add_model_file("decoder_model_merged");
31564
- files.push("generation_config.json");
31565
- } else if (modelType === MODEL_TYPES.MaskGeneration) {
31566
- add_model_file("model", "vision_encoder");
31567
- add_model_file("prompt_encoder_mask_decoder");
31568
- } else if (modelType === MODEL_TYPES.EncoderDecoder) {
31569
- add_model_file("model", "encoder_model");
31570
- add_model_file("decoder_model_merged");
31571
- } else if (modelType === MODEL_TYPES.ImageTextToText) {
31572
- add_model_file("embed_tokens");
31573
- add_model_file("vision_encoder");
31574
- add_model_file("decoder_model_merged");
31575
- if (config.is_encoder_decoder) {
31576
- add_model_file("model", "encoder_model");
31577
- }
31578
- files.push("generation_config.json");
31579
- } else if (modelType === MODEL_TYPES.AudioTextToText) {
31580
- add_model_file("embed_tokens");
31581
- add_model_file("audio_encoder");
31582
- add_model_file("decoder_model_merged");
31583
- files.push("generation_config.json");
31584
- } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
31585
- add_model_file("embed_tokens");
31586
- add_model_file("audio_encoder");
31587
- add_model_file("vision_encoder");
31588
- add_model_file("decoder_model_merged");
31589
- files.push("generation_config.json");
31590
- } else if (modelType === MODEL_TYPES.Musicgen) {
31591
- add_model_file("model", "text_encoder");
31592
- add_model_file("decoder_model_merged");
31593
- add_model_file("encodec_decode");
31594
- files.push("generation_config.json");
31595
- } else if (modelType === MODEL_TYPES.MultiModality) {
31596
- add_model_file("prepare_inputs_embeds");
31597
- add_model_file("model", "language_model");
31598
- add_model_file("lm_head");
31599
- add_model_file("gen_head");
31600
- add_model_file("gen_img_embeds");
31601
- add_model_file("image_decode");
31602
- files.push("generation_config.json");
31603
- } else if (modelType === MODEL_TYPES.Phi3V) {
31604
- add_model_file("prepare_inputs_embeds");
31605
- add_model_file("model");
31606
- add_model_file("vision_encoder");
31607
- files.push("generation_config.json");
31608
- } else if (modelType === MODEL_TYPES.Chatterbox) {
31609
- add_model_file("embed_tokens");
31610
- add_model_file("speech_encoder");
31611
- add_model_file("model", "language_model");
31612
- add_model_file("conditional_decoder");
31613
- files.push("generation_config.json");
31614
- } else if (modelType === MODEL_TYPES.AutoEncoder) {
31615
- add_model_file("encoder_model");
31616
- add_model_file("decoder_model");
31617
- } else if (modelType === MODEL_TYPES.Supertonic) {
31618
- add_model_file("text_encoder");
31619
- add_model_file("latent_denoiser");
31620
- add_model_file("voice_decoder");
31621
- } else {
31622
- add_model_file("model", singleModelName);
32939
+ const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
32940
+ for (const [sessionKey, baseName] of Object.entries(sessions)) {
32941
+ add_model_file(sessionKey, baseName);
32942
+ }
32943
+ if (optional_configs) {
32944
+ for (const configFile of Object.values(optional_configs)) {
32945
+ files.push(configFile);
32946
+ }
31623
32947
  }
31624
32948
  return files;
31625
32949
  }
@@ -32070,25 +33394,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
32070
33394
 
32071
33395
  // src/utils/model_registry/is_cached.js
32072
33396
  async function check_files_cache(modelId, files, options = {}) {
32073
- const cache = await getCache(options?.cache_dir);
32074
- if (!cache) {
33397
+ const cache2 = await getCache(options?.cache_dir);
33398
+ if (!cache2) {
32075
33399
  const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
32076
33400
  return { allCached: false, files: fileStatuses2 };
32077
33401
  }
32078
33402
  const fileStatuses = await Promise.all(
32079
33403
  files.map(async (filename) => {
32080
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
32081
- const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
33404
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
33405
+ const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
32082
33406
  return { file: filename, cached: !!cached };
32083
33407
  })
32084
33408
  );
32085
33409
  return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
32086
33410
  }
32087
33411
  async function is_file_cached(modelId, filename, options = {}) {
32088
- const cache = await getCache(options?.cache_dir);
32089
- if (!cache) return false;
32090
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
32091
- return !!await checkCachedResource(cache, localPath, proposedCacheKey);
33412
+ const cache2 = await getCache(options?.cache_dir);
33413
+ if (!cache2) return false;
33414
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
33415
+ return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
32092
33416
  }
32093
33417
  async function is_cached(modelId, options = {}) {
32094
33418
  if (!modelId) {
@@ -32135,26 +33459,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
32135
33459
 
32136
33460
  // src/utils/model_registry/clear_cache.js
32137
33461
  async function clear_files_from_cache(modelId, files, options = {}) {
32138
- const cache = await getCache(options?.cache_dir);
32139
- if (!cache) {
33462
+ const cache2 = await getCache(options?.cache_dir);
33463
+ if (!cache2) {
32140
33464
  return {
32141
33465
  filesDeleted: 0,
32142
33466
  filesCached: 0,
32143
33467
  files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
32144
33468
  };
32145
33469
  }
32146
- if (!cache.delete) {
33470
+ if (!cache2.delete) {
32147
33471
  throw new Error("Cache does not support delete operation");
32148
33472
  }
32149
33473
  const results = await Promise.all(
32150
33474
  files.map(async (filename) => {
32151
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
32152
- const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
33475
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
33476
+ const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
32153
33477
  const wasCached = !!cached;
32154
33478
  let deleted = false;
32155
33479
  if (wasCached) {
32156
- const deletedWithProposed = await cache.delete(proposedCacheKey);
32157
- const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache.delete(localPath) : false;
33480
+ const deletedWithProposed = await cache2.delete(proposedCacheKey);
33481
+ const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
32158
33482
  deleted = deletedWithProposed || deletedWithLocal;
32159
33483
  }
32160
33484
  return { file: filename, deleted, wasCached };
@@ -32505,6 +33829,9 @@ var ModelRegistry = class {
32505
33829
  BloomModel,
32506
33830
  BloomPreTrainedModel,
32507
33831
  BloomTokenizer,
33832
+ CHMv2ForDepthEstimation,
33833
+ CHMv2ImageProcessor,
33834
+ CHMv2PreTrainedModel,
32508
33835
  CLIPFeatureExtractor,
32509
33836
  CLIPImageProcessor,
32510
33837
  CLIPModel,
@@ -32600,6 +33927,9 @@ var ModelRegistry = class {
32600
33927
  DebertaV2Tokenizer,
32601
33928
  DecisionTransformerModel,
32602
33929
  DecisionTransformerPreTrainedModel,
33930
+ DeepseekV3ForCausalLM,
33931
+ DeepseekV3Model,
33932
+ DeepseekV3PreTrainedModel,
32603
33933
  DeiTFeatureExtractor,
32604
33934
  DeiTForImageClassification,
32605
33935
  DeiTImageProcessor,
@@ -32636,6 +33966,7 @@ var ModelRegistry = class {
32636
33966
  DonutImageProcessor,
32637
33967
  DonutSwinModel,
32638
33968
  DonutSwinPreTrainedModel,
33969
+ DynamicCache,
32639
33970
  EdgeTamModel,
32640
33971
  EfficientNetForImageClassification,
32641
33972
  EfficientNetImageProcessor,
@@ -32659,6 +33990,11 @@ var ModelRegistry = class {
32659
33990
  EsmModel,
32660
33991
  EsmPreTrainedModel,
32661
33992
  EsmTokenizer,
33993
+ EuroBertForMaskedLM,
33994
+ EuroBertForSequenceClassification,
33995
+ EuroBertForTokenClassification,
33996
+ EuroBertModel,
33997
+ EuroBertPreTrainedModel,
32662
33998
  ExaoneForCausalLM,
32663
33999
  ExaoneModel,
32664
34000
  ExaonePreTrainedModel,
@@ -32708,6 +34044,7 @@ var ModelRegistry = class {
32708
34044
  Gemma3Model,
32709
34045
  Gemma3PreTrainedModel,
32710
34046
  Gemma3nAudioFeatureExtractor,
34047
+ Gemma3nForCausalLM,
32711
34048
  Gemma3nForConditionalGeneration,
32712
34049
  Gemma3nPreTrainedModel,
32713
34050
  Gemma3nProcessor,
@@ -32715,8 +34052,14 @@ var ModelRegistry = class {
32715
34052
  GemmaModel,
32716
34053
  GemmaPreTrainedModel,
32717
34054
  GemmaTokenizer,
34055
+ Glm46VImageProcessor,
34056
+ Glm46VProcessor,
32718
34057
  GlmForCausalLM,
32719
34058
  GlmModel,
34059
+ GlmMoeDsaForCausalLM,
34060
+ GlmMoeDsaModel,
34061
+ GlmMoeDsaPreTrainedModel,
34062
+ GlmOcrForConditionalGeneration,
32720
34063
  GlmPreTrainedModel,
32721
34064
  GptOssForCausalLM,
32722
34065
  GptOssModel,
@@ -32727,6 +34070,9 @@ var ModelRegistry = class {
32727
34070
  GraniteMoeHybridModel,
32728
34071
  GraniteMoeHybridPreTrainedModel,
32729
34072
  GranitePreTrainedModel,
34073
+ GraniteSpeechFeatureExtractor,
34074
+ GraniteSpeechForConditionalGeneration,
34075
+ GraniteSpeechProcessor,
32730
34076
  GroundingDinoForObjectDetection,
32731
34077
  GroundingDinoImageProcessor,
32732
34078
  GroundingDinoPreTrainedModel,
@@ -32752,7 +34098,6 @@ var ModelRegistry = class {
32752
34098
  IJepaPreTrainedModel,
32753
34099
  Idefics3ForConditionalGeneration,
32754
34100
  Idefics3ImageProcessor,
32755
- Idefics3PreTrainedModel,
32756
34101
  Idefics3Processor,
32757
34102
  ImageClassificationPipeline,
32758
34103
  ImageFeatureExtractionPipeline,
@@ -32777,6 +34122,10 @@ var ModelRegistry = class {
32777
34122
  Lfm2MoeModel,
32778
34123
  Lfm2MoePreTrainedModel,
32779
34124
  Lfm2PreTrainedModel,
34125
+ Lfm2VlForConditionalGeneration,
34126
+ Lfm2VlImageProcessor,
34127
+ Lfm2VlProcessor,
34128
+ LightOnOcrForConditionalGeneration,
32780
34129
  LiteWhisperForConditionalGeneration,
32781
34130
  Llama4ForCausalLM,
32782
34131
  Llama4PreTrainedModel,
@@ -32846,6 +34195,9 @@ var ModelRegistry = class {
32846
34195
  MimiPreTrainedModel,
32847
34196
  MinLengthLogitsProcessor,
32848
34197
  MinNewTokensLengthLogitsProcessor,
34198
+ Mistral4ForCausalLM,
34199
+ Mistral4Model,
34200
+ Mistral4PreTrainedModel,
32849
34201
  MistralForCausalLM,
32850
34202
  MistralModel,
32851
34203
  MistralPreTrainedModel,
@@ -32917,6 +34269,9 @@ var ModelRegistry = class {
32917
34269
  NanoChatForCausalLM,
32918
34270
  NanoChatModel,
32919
34271
  NanoChatPreTrainedModel,
34272
+ NemotronHForCausalLM,
34273
+ NemotronHModel,
34274
+ NemotronHPreTrainedModel,
32920
34275
  NeoBertForMaskedLM,
32921
34276
  NeoBertForQuestionAnswering,
32922
34277
  NeoBertForSequenceClassification,
@@ -32960,7 +34315,6 @@ var ModelRegistry = class {
32960
34315
  Owlv2Model,
32961
34316
  Owlv2PreTrainedModel,
32962
34317
  PaliGemmaForConditionalGeneration,
32963
- PaliGemmaPreTrainedModel,
32964
34318
  PaliGemmaProcessor,
32965
34319
  ParakeetFeatureExtractor,
32966
34320
  ParakeetForCTC,
@@ -33004,10 +34358,12 @@ var ModelRegistry = class {
33004
34358
  Qwen2MoePreTrainedModel,
33005
34359
  Qwen2PreTrainedModel,
33006
34360
  Qwen2Tokenizer,
34361
+ Qwen2VLForCausalLM,
33007
34362
  Qwen2VLForConditionalGeneration,
33008
34363
  Qwen2VLImageProcessor,
33009
34364
  Qwen2VLPreTrainedModel,
33010
34365
  Qwen2VLProcessor,
34366
+ Qwen2_5_VLForCausalLM,
33011
34367
  Qwen2_5_VLForConditionalGeneration,
33012
34368
  Qwen2_5_VLProcessor,
33013
34369
  Qwen3ForCausalLM,
@@ -33019,10 +34375,14 @@ var ModelRegistry = class {
33019
34375
  Qwen3NextModel,
33020
34376
  Qwen3NextPreTrainedModel,
33021
34377
  Qwen3PreTrainedModel,
34378
+ Qwen3VLForCausalLM,
33022
34379
  Qwen3VLForConditionalGeneration,
34380
+ Qwen3VLMoeForCausalLM,
33023
34381
  Qwen3VLMoeForConditionalGeneration,
33024
34382
  Qwen3VLProcessor,
34383
+ Qwen3_5ForCausalLM,
33025
34384
  Qwen3_5ForConditionalGeneration,
34385
+ Qwen3_5MoeForCausalLM,
33026
34386
  Qwen3_5MoeForConditionalGeneration,
33027
34387
  RFDetrForObjectDetection,
33028
34388
  RFDetrModel,
@@ -33094,7 +34454,6 @@ var ModelRegistry = class {
33094
34454
  SmolLM3ForCausalLM,
33095
34455
  SmolLM3Model,
33096
34456
  SmolLM3PreTrainedModel,
33097
- SmolVLMForConditionalGeneration,
33098
34457
  SmolVLMImageProcessor,
33099
34458
  SmolVLMProcessor,
33100
34459
  SnacDecoderModel,
@@ -33102,6 +34461,9 @@ var ModelRegistry = class {
33102
34461
  SnacFeatureExtractor,
33103
34462
  SnacModel,
33104
34463
  SnacPreTrainedModel,
34464
+ SolarOpenForCausalLM,
34465
+ SolarOpenModel,
34466
+ SolarOpenPreTrainedModel,
33105
34467
  SpeechT5FeatureExtractor,
33106
34468
  SpeechT5ForSpeechToText,
33107
34469
  SpeechT5ForTextToSpeech,
@@ -33200,6 +34562,10 @@ var ModelRegistry = class {
33200
34562
  VitsTokenizer,
33201
34563
  VoxtralForConditionalGeneration,
33202
34564
  VoxtralProcessor,
34565
+ VoxtralRealtimeFeatureExtractor,
34566
+ VoxtralRealtimeForConditionalGeneration,
34567
+ VoxtralRealtimePreTrainedModel,
34568
+ VoxtralRealtimeProcessor,
33203
34569
  Wav2Vec2BertForCTC,
33204
34570
  Wav2Vec2BertForSequenceClassification,
33205
34571
  Wav2Vec2BertModel,
@@ -33295,7 +34661,7 @@ var ModelRegistry = class {
33295
34661
 
33296
34662
  onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
33297
34663
  (*!
33298
- * ONNX Runtime Web v1.25.0-dev.20260303-e7e64dc112
34664
+ * ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
33299
34665
  * Copyright (c) Microsoft Corporation. All rights reserved.
33300
34666
  * Licensed under the MIT License.
33301
34667
  *)