@huggingface/transformers 4.0.0-next.0 → 4.0.0-next.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (364) hide show
  1. package/README.md +32 -6
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +31 -31
  3. package/dist/transformers.js +9261 -1709
  4. package/dist/transformers.min.js +25 -18
  5. package/dist/transformers.node.cjs +6119 -3174
  6. package/dist/transformers.node.min.cjs +25 -23
  7. package/dist/transformers.node.min.mjs +25 -23
  8. package/dist/transformers.node.mjs +6034 -3168
  9. package/dist/transformers.web.js +4255 -1381
  10. package/dist/transformers.web.min.js +23 -19
  11. package/package.json +6 -6
  12. package/src/backends/onnx.js +128 -53
  13. package/src/backends/utils/cacheWasm.js +28 -46
  14. package/src/cache_utils.js +62 -0
  15. package/src/configs.js +123 -23
  16. package/src/env.js +100 -11
  17. package/src/generation/logits_sampler.js +3 -15
  18. package/src/generation/parameters.js +1 -1
  19. package/src/generation/streamers.js +21 -0
  20. package/src/image_processors_utils.js +29 -23
  21. package/src/models/afmoe/modeling_afmoe.js +5 -0
  22. package/src/models/auto/image_processing_auto.js +2 -1
  23. package/src/models/auto/modeling_auto.js +16 -2
  24. package/src/models/auto/tokenization_auto.js +2 -1
  25. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  26. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  27. package/src/models/chmv2/modeling_chmv2.js +4 -0
  28. package/src/models/clap/feature_extraction_clap.js +2 -1
  29. package/src/models/cohere2/modeling_cohere2.js +5 -0
  30. package/src/models/cohere_asr/feature_extraction_cohere_asr.js +117 -0
  31. package/src/models/cohere_asr/modeling_cohere_asr.js +11 -0
  32. package/src/models/cohere_asr/processing_cohere_asr.js +55 -0
  33. package/src/models/cohere_asr/tokenization_cohere_asr.js +3 -0
  34. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  35. package/src/models/detr/image_processing_detr.js +1 -1
  36. package/src/models/eurobert/modeling_eurobert.js +41 -0
  37. package/src/models/feature_extractors.js +3 -0
  38. package/src/models/gemma3/image_processing_gemma3.js +3 -0
  39. package/src/models/gemma3/modeling_gemma3.js +4 -1
  40. package/src/models/gemma3/processing_gemma3.js +45 -0
  41. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  42. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  43. package/src/models/glm46v/processing_glm46v.js +5 -0
  44. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  45. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  46. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  47. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  48. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  49. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  50. package/src/models/idefics3/modeling_idefics3.js +5 -32
  51. package/src/models/image_processors.js +4 -0
  52. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  53. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  54. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  55. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  56. package/src/models/llava/modeling_llava.js +1 -1
  57. package/src/models/marian/tokenization_marian.js +3 -2
  58. package/src/models/mistral3/modeling_mistral3.js +2 -2
  59. package/src/models/mistral4/modeling_mistral4.js +5 -0
  60. package/src/models/modeling_utils.js +283 -300
  61. package/src/models/models.js +26 -1
  62. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  63. package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
  64. package/src/models/paligemma/modeling_paligemma.js +2 -25
  65. package/src/models/paligemma/processing_paligemma.js +3 -2
  66. package/src/models/processors.js +8 -0
  67. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +9 -0
  68. package/src/models/qwen2_5_vl/processing_qwen2_5_vl.js +3 -0
  69. package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
  70. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +15 -1
  71. package/src/models/qwen2_vl/modeling_qwen2_vl.js +240 -143
  72. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  73. package/src/models/qwen3_5/modeling_qwen3_5.js +4 -0
  74. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +4 -0
  75. package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
  76. package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
  77. package/src/models/qwen3_vl/modeling_qwen3_vl.js +4 -0
  78. package/src/models/qwen3_vl/processing_qwen3_vl.js +3 -0
  79. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
  80. package/src/models/registry.js +61 -5
  81. package/src/models/sam/image_processing_sam.js +1 -1
  82. package/src/models/session.js +33 -56
  83. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  84. package/src/models/solar_open/modeling_solar_open.js +5 -0
  85. package/src/models/tokenizers.js +1 -0
  86. package/src/models/ultravox/modeling_ultravox.js +1 -3
  87. package/src/models/voxtral/modeling_voxtral.js +3 -0
  88. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  89. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  90. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  91. package/src/models/whisper/feature_extraction_whisper.js +4 -13
  92. package/src/models/whisper/modeling_whisper.js +6 -5
  93. package/src/models/xlm/tokenization_xlm.js +2 -1
  94. package/src/pipelines/automatic-speech-recognition.js +47 -3
  95. package/src/pipelines/document-question-answering.js +1 -1
  96. package/src/pipelines/image-to-text.js +2 -2
  97. package/src/pipelines/index.js +313 -0
  98. package/src/pipelines/summarization.js +1 -1
  99. package/src/pipelines/text-generation.js +5 -1
  100. package/src/pipelines/text-to-audio.js +4 -2
  101. package/src/pipelines/text2text-generation.js +1 -1
  102. package/src/pipelines/translation.js +1 -1
  103. package/src/pipelines/zero-shot-classification.js +3 -2
  104. package/src/pipelines.js +140 -428
  105. package/src/tokenization_utils.js +42 -21
  106. package/src/transformers.js +10 -1
  107. package/src/utils/audio.js +20 -3
  108. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  109. package/src/utils/cache/FileCache.js +128 -0
  110. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  111. package/src/utils/cache.js +12 -4
  112. package/src/utils/core.js +23 -1
  113. package/src/utils/devices.js +22 -0
  114. package/src/utils/dtypes.js +55 -0
  115. package/src/utils/hub/{files.js → FileResponse.js} +0 -90
  116. package/src/utils/hub/utils.js +45 -5
  117. package/src/utils/hub.js +67 -23
  118. package/src/utils/image.js +14 -14
  119. package/src/utils/logger.js +67 -0
  120. package/src/utils/lru_cache.js +67 -0
  121. package/src/utils/memoize_promise.js +45 -0
  122. package/src/utils/model-loader.js +35 -17
  123. package/src/utils/model_registry/ModelRegistry.js +382 -0
  124. package/src/utils/model_registry/clear_cache.js +128 -0
  125. package/src/utils/model_registry/get_available_dtypes.js +68 -0
  126. package/src/utils/model_registry/get_file_metadata.js +162 -0
  127. package/src/utils/model_registry/get_files.js +42 -0
  128. package/src/utils/model_registry/get_model_files.js +114 -0
  129. package/src/utils/model_registry/get_pipeline_files.js +44 -0
  130. package/src/utils/model_registry/get_processor_files.js +20 -0
  131. package/src/utils/model_registry/get_tokenizer_files.js +21 -0
  132. package/src/utils/model_registry/is_cached.js +169 -0
  133. package/src/utils/model_registry/resolve_model_type.js +66 -0
  134. package/src/utils/random.js +225 -0
  135. package/src/utils/tensor.js +26 -23
  136. package/src/utils/video.js +2 -2
  137. package/types/backends/onnx.d.ts.map +1 -1
  138. package/types/backends/utils/cacheWasm.d.ts +3 -17
  139. package/types/backends/utils/cacheWasm.d.ts.map +1 -1
  140. package/types/cache_utils.d.ts +29 -0
  141. package/types/cache_utils.d.ts.map +1 -0
  142. package/types/configs.d.ts.map +1 -1
  143. package/types/env.d.ts +60 -27
  144. package/types/env.d.ts.map +1 -1
  145. package/types/generation/logits_sampler.d.ts +2 -2
  146. package/types/generation/logits_sampler.d.ts.map +1 -1
  147. package/types/generation/parameters.d.ts +1 -1
  148. package/types/generation/parameters.d.ts.map +1 -1
  149. package/types/generation/streamers.d.ts +1 -0
  150. package/types/generation/streamers.d.ts.map +1 -1
  151. package/types/image_processors_utils.d.ts +18 -1
  152. package/types/image_processors_utils.d.ts.map +1 -1
  153. package/types/models/afmoe/modeling_afmoe.d.ts +8 -0
  154. package/types/models/afmoe/modeling_afmoe.d.ts.map +1 -0
  155. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  156. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  157. package/types/models/auto/image_processing_auto.d.ts.map +1 -1
  158. package/types/models/auto/modeling_auto.d.ts +6 -0
  159. package/types/models/auto/modeling_auto.d.ts.map +1 -1
  160. package/types/models/auto/tokenization_auto.d.ts.map +1 -1
  161. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  162. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  163. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  164. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  165. package/types/models/clap/feature_extraction_clap.d.ts.map +1 -1
  166. package/types/models/cohere2/modeling_cohere2.d.ts +8 -0
  167. package/types/models/cohere2/modeling_cohere2.d.ts.map +1 -0
  168. package/types/models/cohere_asr/feature_extraction_cohere_asr.d.ts +25 -0
  169. package/types/models/cohere_asr/feature_extraction_cohere_asr.d.ts.map +1 -0
  170. package/types/models/cohere_asr/modeling_cohere_asr.d.ts +9 -0
  171. package/types/models/cohere_asr/modeling_cohere_asr.d.ts.map +1 -0
  172. package/types/models/cohere_asr/processing_cohere_asr.d.ts +27 -0
  173. package/types/models/cohere_asr/processing_cohere_asr.d.ts.map +1 -0
  174. package/types/models/cohere_asr/tokenization_cohere_asr.d.ts +4 -0
  175. package/types/models/cohere_asr/tokenization_cohere_asr.d.ts.map +1 -0
  176. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  177. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  178. package/types/models/detr/image_processing_detr.d.ts +1 -1
  179. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  180. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  181. package/types/models/feature_extractors.d.ts +3 -0
  182. package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
  183. package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
  184. package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
  185. package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
  186. package/types/models/gemma3/processing_gemma3.d.ts +20 -0
  187. package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
  188. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  189. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  190. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  191. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  192. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  193. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  194. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  195. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  196. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  197. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  198. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  199. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  200. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  201. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  202. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  203. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  204. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  205. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  206. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  207. package/types/models/image_processors.d.ts +4 -0
  208. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  209. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  210. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  211. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  212. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  213. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  214. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  215. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  216. package/types/models/marian/tokenization_marian.d.ts.map +1 -1
  217. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  218. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  219. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  220. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  221. package/types/models/modeling_utils.d.ts +46 -27
  222. package/types/models/modeling_utils.d.ts.map +1 -1
  223. package/types/models/models.d.ts +26 -1
  224. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  225. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  226. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
  227. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
  228. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  229. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  230. package/types/models/paligemma/processing_paligemma.d.ts.map +1 -1
  231. package/types/models/processors.d.ts +8 -0
  232. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +7 -0
  233. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -0
  234. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts +4 -0
  235. package/types/models/qwen2_5_vl/processing_qwen2_5_vl.d.ts.map +1 -0
  236. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
  237. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
  238. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts +3 -0
  239. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  240. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +44 -6
  241. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  242. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  243. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  244. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +6 -0
  245. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -0
  246. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +7 -0
  247. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -0
  248. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
  249. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
  250. package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
  251. package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
  252. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +7 -0
  253. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -0
  254. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts +4 -0
  255. package/types/models/qwen3_vl/processing_qwen3_vl.d.ts.map +1 -0
  256. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
  257. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
  258. package/types/models/registry.d.ts +2 -1
  259. package/types/models/registry.d.ts.map +1 -1
  260. package/types/models/sam/image_processing_sam.d.ts +1 -1
  261. package/types/models/session.d.ts +3 -2
  262. package/types/models/session.d.ts.map +1 -1
  263. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  264. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  265. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  266. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  267. package/types/models/tokenizers.d.ts +1 -0
  268. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  269. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  270. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  271. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  272. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  273. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  274. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  275. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  276. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  277. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  278. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  279. package/types/models/whisper/modeling_whisper.d.ts.map +1 -1
  280. package/types/models/xlm/tokenization_xlm.d.ts.map +1 -1
  281. package/types/pipelines/automatic-speech-recognition.d.ts +7 -2
  282. package/types/pipelines/automatic-speech-recognition.d.ts.map +1 -1
  283. package/types/pipelines/document-question-answering.d.ts +2 -2
  284. package/types/pipelines/document-question-answering.d.ts.map +1 -1
  285. package/types/pipelines/image-to-text.d.ts +4 -4
  286. package/types/pipelines/image-to-text.d.ts.map +1 -1
  287. package/types/pipelines/index.d.ts +265 -0
  288. package/types/pipelines/index.d.ts.map +1 -0
  289. package/types/pipelines/summarization.d.ts +2 -2
  290. package/types/pipelines/summarization.d.ts.map +1 -1
  291. package/types/pipelines/text-generation.d.ts +7 -3
  292. package/types/pipelines/text-generation.d.ts.map +1 -1
  293. package/types/pipelines/text-to-audio.d.ts.map +1 -1
  294. package/types/pipelines/text2text-generation.d.ts +3 -3
  295. package/types/pipelines/text2text-generation.d.ts.map +1 -1
  296. package/types/pipelines/translation.d.ts +2 -2
  297. package/types/pipelines/translation.d.ts.map +1 -1
  298. package/types/pipelines/zero-shot-classification.d.ts.map +1 -1
  299. package/types/pipelines.d.ts +51 -291
  300. package/types/pipelines.d.ts.map +1 -1
  301. package/types/tokenization_utils.d.ts +44 -26
  302. package/types/tokenization_utils.d.ts.map +1 -1
  303. package/types/transformers.d.ts +7 -1
  304. package/types/transformers.d.ts.map +1 -1
  305. package/types/utils/audio.d.ts +5 -2
  306. package/types/utils/audio.d.ts.map +1 -1
  307. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  308. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  309. package/types/utils/cache/FileCache.d.ts +39 -0
  310. package/types/utils/cache/FileCache.d.ts.map +1 -0
  311. package/types/utils/cache.d.ts +10 -4
  312. package/types/utils/cache.d.ts.map +1 -1
  313. package/types/utils/core.d.ts +59 -2
  314. package/types/utils/core.d.ts.map +1 -1
  315. package/types/utils/devices.d.ts +15 -0
  316. package/types/utils/devices.d.ts.map +1 -1
  317. package/types/utils/dtypes.d.ts +17 -1
  318. package/types/utils/dtypes.d.ts.map +1 -1
  319. package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -32
  320. package/types/utils/hub/FileResponse.d.ts.map +1 -0
  321. package/types/utils/hub/utils.d.ts +19 -3
  322. package/types/utils/hub/utils.d.ts.map +1 -1
  323. package/types/utils/hub.d.ts +36 -7
  324. package/types/utils/hub.d.ts.map +1 -1
  325. package/types/utils/image.d.ts +1 -1
  326. package/types/utils/logger.d.ts +28 -0
  327. package/types/utils/logger.d.ts.map +1 -0
  328. package/types/utils/lru_cache.d.ts +38 -0
  329. package/types/utils/lru_cache.d.ts.map +1 -0
  330. package/types/utils/memoize_promise.d.ts +14 -0
  331. package/types/utils/memoize_promise.d.ts.map +1 -0
  332. package/types/utils/model-loader.d.ts +15 -0
  333. package/types/utils/model-loader.d.ts.map +1 -1
  334. package/types/utils/model_registry/ModelRegistry.d.ts +298 -0
  335. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -0
  336. package/types/utils/model_registry/clear_cache.d.ts +74 -0
  337. package/types/utils/model_registry/clear_cache.d.ts.map +1 -0
  338. package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
  339. package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
  340. package/types/utils/model_registry/get_file_metadata.d.ts +20 -0
  341. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -0
  342. package/types/utils/model_registry/get_files.d.ts +23 -0
  343. package/types/utils/model_registry/get_files.d.ts.map +1 -0
  344. package/types/utils/model_registry/get_model_files.d.ts +48 -0
  345. package/types/utils/model_registry/get_model_files.d.ts.map +1 -0
  346. package/types/utils/model_registry/get_pipeline_files.d.ts +22 -0
  347. package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -0
  348. package/types/utils/model_registry/get_processor_files.d.ts +9 -0
  349. package/types/utils/model_registry/get_processor_files.d.ts.map +1 -0
  350. package/types/utils/model_registry/get_tokenizer_files.d.ts +9 -0
  351. package/types/utils/model_registry/get_tokenizer_files.d.ts.map +1 -0
  352. package/types/utils/model_registry/is_cached.d.ts +105 -0
  353. package/types/utils/model_registry/is_cached.d.ts.map +1 -0
  354. package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
  355. package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
  356. package/types/utils/random.d.ts +86 -0
  357. package/types/utils/random.d.ts.map +1 -0
  358. package/types/utils/tensor.d.ts.map +1 -1
  359. package/src/utils/data-structures.js +0 -572
  360. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  361. package/types/utils/data-structures.d.ts +0 -294
  362. package/types/utils/data-structures.d.ts.map +0 -1
  363. package/types/utils/hub/files.d.ts.map +0 -1
  364. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -17,6 +17,172 @@ export class Qwen2VLPreTrainedModel extends PreTrainedModel {
17
17
  ];
18
18
  }
19
19
  export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
20
+ // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
21
+ // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
22
+ // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
23
+ image_grid_thw_name = 'grid_thw';
24
+
25
+ /**
26
+ * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
27
+ * @param {Tensor} input_ids
28
+ * @param {Tensor} attention_mask
29
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
30
+ */
31
+ _get_text_only_rope_index(input_ids, attention_mask) {
32
+ if (attention_mask) {
33
+ const { data, dims } = cumsum_masked_fill(attention_mask);
34
+
35
+ const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
36
+ /** @type {bigint[]} */
37
+ const mrope_position_deltas = Array.from(
38
+ { length: dims[0] },
39
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]),
40
+ );
41
+
42
+ return [
43
+ new Tensor('int64', position_ids, [3, ...dims]),
44
+ new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]),
45
+ ];
46
+ } else {
47
+ const [batch_size, seq_length] = input_ids.dims;
48
+ const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) =>
49
+ BigInt(Math.floor((i % seq_length) / batch_size)),
50
+ );
51
+
52
+ return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
53
+ }
54
+ }
55
+
56
+ /**
57
+ * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
58
+ * global [all_t, all_h, all_w] order, then write back into the position_ids array
59
+ * respecting attention mask.
60
+ * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
61
+ * @param {number[]} attn_mask Attention mask for this batch element
62
+ * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
63
+ * @param {number} batch_idx Current batch index
64
+ * @returns {number[]} Flat reordered positions of length total_len
65
+ */
66
+ _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
67
+ const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
68
+ const llm_positions = new Array(total_len);
69
+ let index = 0;
70
+ for (let x = 0; x < 3; ++x) {
71
+ for (const val of llm_pos_ids_list) {
72
+ const seg_len = val.length / 3;
73
+ for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
74
+ llm_positions[index++] = val[z];
75
+ }
76
+ }
77
+ }
78
+
79
+ let count = 0;
80
+ for (let y = 0; y < attn_mask.length; ++y) {
81
+ if (attn_mask[y] == 1) {
82
+ for (let x = 0; x < 3; ++x) {
83
+ position_ids_list[x][batch_idx][y] = llm_positions[(x * total_len) / 3 + count];
84
+ }
85
+ ++count;
86
+ }
87
+ }
88
+
89
+ return llm_positions;
90
+ }
91
+
92
+ /**
93
+ * Build per-batch position ID segments for multimodal rope.
94
+ * Override this in subclasses to change how vision/text segments are identified and positioned.
95
+ * @param {object} params
96
+ * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
97
+ * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
98
+ * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
99
+ * @param {number} params.spatial_merge_size
100
+ * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
101
+ * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
102
+ */
103
+ _get_multimodal_rope_positions({
104
+ filtered_ids,
105
+ image_grid_thw_list,
106
+ video_grid_thw_list,
107
+ spatial_merge_size,
108
+ state,
109
+ }) {
110
+ // @ts-ignore
111
+ const { image_token_id, video_token_id, vision_start_token_id } = this.config;
112
+
113
+ const ids = filtered_ids;
114
+ const vision_start_indices = ids.reduce((acc, x, idx) => {
115
+ if (x == vision_start_token_id) acc.push(idx);
116
+ return acc;
117
+ }, []);
118
+
119
+ const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
120
+ const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
121
+ const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
122
+
123
+ /** @type {number[][]} */
124
+ const llm_pos_ids_list = [];
125
+ let st = 0;
126
+ let remain_images = image_nums;
127
+ let remain_videos = video_nums;
128
+ for (let j = 0; j < vision_tokens.length; ++j) {
129
+ const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
130
+ const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
131
+
132
+ const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
133
+ const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
134
+
135
+ let ed;
136
+ let t, h, w;
137
+ if (ed_image < ed_video) {
138
+ [t, h, w] = image_grid_thw_list[state.image_index];
139
+ ++state.image_index;
140
+ --remain_images;
141
+ ed = ed_image;
142
+ } else {
143
+ [t, h, w] = video_grid_thw_list[state.video_index];
144
+ ++state.video_index;
145
+ --remain_videos;
146
+ ed = ed_video;
147
+ }
148
+
149
+ const [llm_grid_t, llm_grid_h, llm_grid_w] = [
150
+ Number(t),
151
+ Math.floor(Number(h) / spatial_merge_size),
152
+ Math.floor(Number(w) / spatial_merge_size),
153
+ ];
154
+ const text_len = ed - st;
155
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
156
+
157
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
158
+
159
+ const offset = text_len + st_idx;
160
+ const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
161
+ const t_index = Array.from(
162
+ { length: grid_size },
163
+ (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)),
164
+ );
165
+ const h_index = Array.from(
166
+ { length: grid_size },
167
+ (_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h),
168
+ );
169
+ const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w));
170
+
171
+ llm_pos_ids_list.push([t_index, h_index, w_index].flat());
172
+
173
+ st = ed + grid_size;
174
+ }
175
+
176
+ if (st < ids.length) {
177
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
178
+ const text_len = ids.length - st;
179
+
180
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
181
+ }
182
+
183
+ return llm_pos_ids_list;
184
+ }
185
+
20
186
  /**
21
187
  * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
22
188
  *
@@ -46,137 +212,49 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
46
212
  * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
47
213
  * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
48
214
  * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
49
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
50
- * - 1 for tokens that are **not masked**,
51
- * - 0 for tokens that are **masked**.
52
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
53
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
54
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
215
+ * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
216
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
55
217
  */
56
218
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
57
219
  // @ts-ignore
58
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
220
+ const { vision_config } = this.config;
59
221
  const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
60
222
 
61
- const mrope_position_deltas = [];
62
223
  if (image_grid_thw || video_grid_thw) {
63
- let total_input_ids = input_ids.tolist();
224
+ const total_input_ids = input_ids.tolist();
64
225
  if (!attention_mask) {
65
226
  attention_mask = ones_like(input_ids);
66
227
  }
67
228
 
68
229
  const attention_mask_list = attention_mask.tolist();
69
- const position_ids_list = Array.from({ length: 3 }, (_) =>
70
- Array.from({ length: input_ids.dims[0] }, (_) => Array.from({ length: input_ids.dims[1] }, (_) => 1)),
230
+ const position_ids_list = Array.from({ length: 3 }, () =>
231
+ Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0)),
71
232
  );
72
233
 
73
234
  const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
74
235
  const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
236
+ const state = { image_index: 0, video_index: 0 };
75
237
 
76
- let image_index = 0;
77
- let video_index = 0;
238
+ const mrope_position_deltas = [];
78
239
  for (let i = 0; i < total_input_ids.length; ++i) {
79
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
80
-
81
- const vision_start_indices = ids.reduce((acc, x, idx) => {
82
- if (x == vision_start_token_id) acc.push(idx);
83
- return acc;
84
- }, []);
85
-
86
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
87
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
88
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
89
-
90
- /** @type {number[][]} */
91
- let llm_pos_ids_list = [];
92
- let st = 0;
93
- let remain_images = image_nums;
94
- let remain_videos = video_nums;
95
- for (let j = 0; j < vision_tokens.length; ++j) {
96
- const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
97
- const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
98
-
99
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
100
-
101
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
102
-
103
- let ed;
104
- let t, h, w;
105
- if (ed_image < ed_video) {
106
- [t, h, w] = image_grid_thw_list[image_index];
107
- ++image_index;
108
- --remain_images;
109
- ed = ed_image;
110
- } else {
111
- [t, h, w] = video_grid_thw_list[video_index];
112
- ++video_index;
113
- --remain_videos;
114
- ed = ed_video;
115
- }
116
-
117
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
118
- Number(t),
119
- Math.floor(Number(h) / spatial_merge_size),
120
- Math.floor(Number(w) / spatial_merge_size),
121
- ];
122
- const text_len = ed - st;
123
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
124
-
125
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
126
-
127
- const offset = text_len + st_idx;
128
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
129
- const t_index = Array.from(
130
- { length: grid_size },
131
- (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w)),
132
- );
133
- const h_index = Array.from(
134
- { length: grid_size },
135
- (_, i) => offset + (Math.floor(i / llm_grid_w) % llm_grid_h),
136
- );
137
- const w_index = Array.from({ length: grid_size }, (_, i) => offset + (i % llm_grid_w));
138
-
139
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
140
-
141
- st = ed + grid_size;
142
- }
143
-
144
- if (st < ids.length) {
145
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
146
- const text_len = ids.length - st;
240
+ const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
147
241
 
148
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + (i % text_len)));
149
- }
242
+ const llm_pos_ids_list = this._get_multimodal_rope_positions({
243
+ filtered_ids,
244
+ image_grid_thw_list,
245
+ video_grid_thw_list,
246
+ spatial_merge_size,
247
+ state,
248
+ });
150
249
 
151
- // NOTE: Each item in llm_pos_ids_list is an array of shape (3, text_len),
152
- // meaning to perform concatenation along dim=1, we can do the following:
153
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
154
- /** @type {number[]} */
155
- const llm_positions = new Array(num_items);
156
- let index = 0;
157
- for (let x = 0; x < 3; ++x) {
158
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
159
- const val = llm_pos_ids_list[y];
160
- const text_len = val.length / 3;
161
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
162
- llm_positions[index++] = val[z];
163
- }
164
- }
165
- }
166
-
167
- let count = 0;
168
- const attn_mask = attention_mask_list[i];
169
- for (let y = 0; y < attn_mask.length; ++y) {
170
- if (attn_mask[y] == 1) {
171
- for (let x = 0; x < 3; ++x) {
172
- position_ids_list[x][i][y] = llm_positions[(x * num_items) / 3 + count];
173
- }
174
- ++count;
175
- }
176
- }
250
+ const llm_positions = this._reorder_and_write_positions(
251
+ llm_pos_ids_list,
252
+ attention_mask_list[i],
253
+ position_ids_list,
254
+ i,
255
+ );
177
256
 
178
- const max_llm_positions = max(llm_positions)[0];
179
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
257
+ mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
180
258
  }
181
259
 
182
260
  return [
@@ -184,35 +262,17 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
184
262
  new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]),
185
263
  ];
186
264
  } else {
187
- // Text-only
188
- if (attention_mask) {
189
- const { data, dims } = cumsum_masked_fill(attention_mask);
190
-
191
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
192
- /** @type {bigint[]} */
193
- const mrope_position_deltas = Array.from(
194
- { length: dims[0] },
195
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1]),
196
- );
197
-
198
- return [
199
- new Tensor('int64', position_ids, [3, ...dims]),
200
- new Tensor('int64', mrope_position_deltas, [mrope_position_deltas.length, 1]),
201
- ];
202
- } else {
203
- const [batch_size, seq_length] = input_ids.dims;
204
- const position_ids = BigInt64Array.from({ length: 3 * batch_size * seq_length }, (_, i) =>
205
- BigInt(Math.floor((i % seq_length) / batch_size)),
206
- );
207
-
208
- return [new Tensor('int64', position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
209
- }
265
+ return this._get_text_only_rope_index(input_ids, attention_mask);
210
266
  }
211
267
  }
212
268
 
213
269
  async encode_image({ pixel_values, image_grid_thw }) {
214
- const features = (await sessionRun(this.sessions['vision_encoder'], { pixel_values, grid_thw: image_grid_thw }))
215
- .image_features;
270
+ const features = (
271
+ await sessionRun(this.sessions['vision_encoder'], {
272
+ pixel_values,
273
+ [this.image_grid_thw_name]: image_grid_thw,
274
+ })
275
+ ).image_features;
216
276
  return features;
217
277
  }
218
278
 
@@ -226,20 +286,55 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
226
286
 
227
287
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
228
288
  // Overwritten -- in specific circumstances we don't want to forward image inputs to the model
229
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
230
- // Calculate position_ids and rope_deltas
231
- if (!model_inputs.past_key_values) {
232
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
289
+ if (!model_inputs.attention_mask || model_inputs.position_ids) {
290
+ return model_inputs;
291
+ }
292
+
293
+ const session = this.sessions['decoder_model_merged'] ?? this.sessions['model'];
294
+ if (!session.inputNames.includes('position_ids')) {
295
+ return model_inputs;
296
+ }
297
+
298
+ // Calculate position_ids and rope_deltas
299
+ if (!model_inputs.past_key_values) {
300
+ [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
301
+ model_inputs.input_ids,
302
+ model_inputs.image_grid_thw,
303
+ model_inputs.video_grid_thw,
304
+ model_inputs.attention_mask,
305
+ );
306
+ } else {
307
+ model_inputs.pixel_values = null;
308
+ // model_inputs.pixel_values_videos = null;
309
+
310
+ const past_length = model_inputs.past_key_values.get_seq_length();
311
+
312
+ if (past_length < model_inputs.input_ids.dims[1]) {
313
+ // Externally provided `past_key_values` with full input_ids:
314
+ // Compute full position_ids, then slice to only the new (unprocessed) tokens.
315
+ const [full_position_ids, rope_deltas] = this.get_rope_index(
233
316
  model_inputs.input_ids,
234
317
  model_inputs.image_grid_thw,
235
318
  model_inputs.video_grid_thw,
236
319
  model_inputs.attention_mask,
237
320
  );
321
+ model_inputs.rope_deltas = rope_deltas;
322
+ model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
323
+ model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
238
324
  } else {
239
- model_inputs.pixel_values = null;
240
- // model_inputs.pixel_values_videos = null;
325
+ // Auto-regressive case: single new token.
326
+ // `rope_deltas` may be absent when generation starts from externally provided `past_key_values`.
327
+ // In that case, recompute from current inputs instead of relying on persisted model state.
328
+ if (!model_inputs.rope_deltas) {
329
+ [, model_inputs.rope_deltas] = this.get_rope_index(
330
+ model_inputs.input_ids,
331
+ model_inputs.image_grid_thw,
332
+ model_inputs.video_grid_thw,
333
+ model_inputs.attention_mask,
334
+ );
335
+ }
241
336
 
242
- const delta = BigInt(Object.values(model_inputs.past_key_values)[0].dims.at(-2));
337
+ const delta = BigInt(past_length);
243
338
  const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
244
339
  model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
245
340
  }
@@ -248,3 +343,5 @@ export class Qwen2VLForConditionalGeneration extends Qwen2VLPreTrainedModel {
248
343
  return model_inputs;
249
344
  }
250
345
  }
346
+
347
+ export class Qwen2VLForCausalLM extends Qwen2VLForConditionalGeneration {}
@@ -6,6 +6,7 @@ import { RawImage } from '../../utils/image.js';
6
6
  export class Qwen2VLProcessor extends Processor {
7
7
  static image_processor_class = AutoImageProcessor;
8
8
  static tokenizer_class = AutoTokenizer;
9
+ static image_token = '<|image_pad|>';
9
10
 
10
11
  /**
11
12
  *
@@ -31,13 +32,14 @@ export class Qwen2VLProcessor extends Processor {
31
32
  let merge_length = this.image_processor.config.merge_size ** 2;
32
33
  let index = 0;
33
34
 
35
+ const image_token = /** @type {typeof Qwen2VLProcessor} */ (this.constructor).image_token;
34
36
  const image_grid_thw_list = image_grid_thw.tolist();
35
37
  text = text.map((t) => {
36
- while (t.includes('<|image_pad|>')) {
38
+ while (t.includes(image_token)) {
37
39
  const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
38
- t = t.replace('<|image_pad|>', '<|placeholder|>'.repeat(Math.floor(prod / merge_length)));
40
+ t = t.replace(image_token, '<|placeholder|>'.repeat(Math.floor(prod / merge_length)));
39
41
  }
40
- return t.replaceAll('<|placeholder|>', '<|image_pad|>');
42
+ return t.replaceAll('<|placeholder|>', image_token);
41
43
  });
42
44
  }
43
45
 
@@ -46,7 +48,6 @@ export class Qwen2VLProcessor extends Processor {
46
48
  return {
47
49
  ...text_inputs,
48
50
  ...image_inputs,
49
- // TODO: ...videos_inputs,
50
51
  };
51
52
  }
52
53
  }
@@ -0,0 +1,4 @@
1
+ import { Qwen3VLForConditionalGeneration } from '../qwen3_vl/modeling_qwen3_vl.js';
2
+
3
+ export class Qwen3_5ForConditionalGeneration extends Qwen3VLForConditionalGeneration {}
4
+ export class Qwen3_5ForCausalLM extends Qwen3_5ForConditionalGeneration {}
@@ -0,0 +1,4 @@
1
+ import { Qwen3_5ForConditionalGeneration, Qwen3_5ForCausalLM } from '../qwen3_5/modeling_qwen3_5.js';
2
+
3
+ export class Qwen3_5MoeForConditionalGeneration extends Qwen3_5ForConditionalGeneration {}
4
+ export class Qwen3_5MoeForCausalLM extends Qwen3_5ForCausalLM {}
@@ -0,0 +1,5 @@
1
+ import { PreTrainedModel } from '../modeling_utils.js';
2
+
3
+ export class Qwen3MoePreTrainedModel extends PreTrainedModel {}
4
+ export class Qwen3MoeModel extends Qwen3MoePreTrainedModel {}
5
+ export class Qwen3MoeForCausalLM extends Qwen3MoePreTrainedModel {}
@@ -0,0 +1,5 @@
1
+ import { PreTrainedModel } from '../modeling_utils.js';
2
+
3
+ export class Qwen3NextPreTrainedModel extends PreTrainedModel {}
4
+ export class Qwen3NextModel extends Qwen3NextPreTrainedModel {}
5
+ export class Qwen3NextForCausalLM extends Qwen3NextPreTrainedModel {}
@@ -0,0 +1,4 @@
1
+ import { Qwen2_5_VLForConditionalGeneration, Qwen2_5_VLForCausalLM } from '../qwen2_5_vl/modeling_qwen2_5_vl.js';
2
+
3
+ export class Qwen3VLForConditionalGeneration extends Qwen2_5_VLForConditionalGeneration {}
4
+ export class Qwen3VLForCausalLM extends Qwen2_5_VLForCausalLM {}
@@ -0,0 +1,3 @@
1
+ import { Qwen2_5_VLProcessor } from '../qwen2_5_vl/processing_qwen2_5_vl.js';
2
+
3
+ export class Qwen3VLProcessor extends Qwen2_5_VLProcessor {}
@@ -0,0 +1,4 @@
1
+ import { Qwen3VLForConditionalGeneration, Qwen3VLForCausalLM } from '../qwen3_vl/modeling_qwen3_vl.js';
2
+
3
+ export class Qwen3VLMoeForConditionalGeneration extends Qwen3VLForConditionalGeneration {}
4
+ export class Qwen3VLMoeForCausalLM extends Qwen3VLForCausalLM {}