transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (835)
  1. transformers/__init__.py +49 -3
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/cli/serve.py +47 -17
  6. transformers/configuration_utils.py +114 -70
  7. transformers/conversion_mapping.py +83 -7
  8. transformers/convert_slow_tokenizer.py +225 -10
  9. transformers/core_model_loading.py +374 -147
  10. transformers/data/data_collator.py +12 -4
  11. transformers/dependency_versions_table.py +2 -3
  12. transformers/dynamic_module_utils.py +1 -2
  13. transformers/feature_extraction_utils.py +55 -24
  14. transformers/file_utils.py +0 -1
  15. transformers/generation/__init__.py +11 -1
  16. transformers/generation/candidate_generator.py +79 -31
  17. transformers/generation/configuration_utils.py +165 -124
  18. transformers/generation/continuous_batching/__init__.py +4 -0
  19. transformers/generation/continuous_batching/cache.py +47 -18
  20. transformers/generation/continuous_batching/cache_manager.py +131 -34
  21. transformers/generation/continuous_batching/continuous_api.py +228 -136
  22. transformers/generation/continuous_batching/requests.py +28 -1
  23. transformers/generation/continuous_batching/scheduler.py +11 -4
  24. transformers/generation/stopping_criteria.py +1 -1
  25. transformers/generation/utils.py +108 -110
  26. transformers/generation/watermarking.py +8 -5
  27. transformers/image_processing_base.py +3 -14
  28. transformers/image_processing_utils_fast.py +15 -4
  29. transformers/initialization.py +37 -0
  30. transformers/integrations/__init__.py +16 -2
  31. transformers/integrations/accelerate.py +58 -113
  32. transformers/integrations/aqlm.py +36 -66
  33. transformers/integrations/awq.py +46 -515
  34. transformers/integrations/bitnet.py +47 -105
  35. transformers/integrations/bitsandbytes.py +91 -202
  36. transformers/integrations/deepspeed.py +18 -2
  37. transformers/integrations/eetq.py +84 -81
  38. transformers/integrations/fbgemm_fp8.py +191 -145
  39. transformers/integrations/finegrained_fp8.py +241 -208
  40. transformers/integrations/flash_attention.py +2 -2
  41. transformers/integrations/fp_quant.py +92 -0
  42. transformers/integrations/ggml.py +11 -1
  43. transformers/integrations/higgs.py +37 -62
  44. transformers/integrations/hub_kernels.py +65 -8
  45. transformers/integrations/integration_utils.py +45 -0
  46. transformers/integrations/mistral.py +12 -0
  47. transformers/integrations/moe.py +240 -0
  48. transformers/integrations/mxfp4.py +28 -74
  49. transformers/integrations/peft.py +12 -29
  50. transformers/integrations/quanto.py +77 -56
  51. transformers/integrations/quark.py +55 -0
  52. transformers/integrations/spqr.py +42 -90
  53. transformers/integrations/tensor_parallel.py +167 -221
  54. transformers/integrations/torchao.py +32 -38
  55. transformers/integrations/vptq.py +40 -59
  56. transformers/modelcard.py +1 -2
  57. transformers/modeling_gguf_pytorch_utils.py +74 -19
  58. transformers/modeling_rope_utils.py +107 -86
  59. transformers/modeling_utils.py +611 -527
  60. transformers/models/__init__.py +22 -0
  61. transformers/models/afmoe/modeling_afmoe.py +10 -19
  62. transformers/models/afmoe/modular_afmoe.py +5 -13
  63. transformers/models/aimv2/modeling_aimv2.py +4 -0
  64. transformers/models/aimv2/modular_aimv2.py +4 -0
  65. transformers/models/albert/modeling_albert.py +3 -0
  66. transformers/models/albert/tokenization_albert.py +6 -12
  67. transformers/models/align/modeling_align.py +14 -6
  68. transformers/models/altclip/modeling_altclip.py +11 -3
  69. transformers/models/apertus/modeling_apertus.py +8 -6
  70. transformers/models/apertus/modular_apertus.py +4 -1
  71. transformers/models/arcee/modeling_arcee.py +5 -5
  72. transformers/models/aria/modeling_aria.py +12 -8
  73. transformers/models/aria/modular_aria.py +7 -3
  74. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  75. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  76. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  77. transformers/models/auto/auto_factory.py +1 -1
  78. transformers/models/auto/configuration_auto.py +38 -0
  79. transformers/models/auto/feature_extraction_auto.py +9 -3
  80. transformers/models/auto/image_processing_auto.py +5 -2
  81. transformers/models/auto/modeling_auto.py +37 -0
  82. transformers/models/auto/processing_auto.py +22 -10
  83. transformers/models/auto/tokenization_auto.py +147 -566
  84. transformers/models/auto/video_processing_auto.py +5 -2
  85. transformers/models/autoformer/modeling_autoformer.py +4 -0
  86. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  87. transformers/models/bamba/modeling_bamba.py +21 -21
  88. transformers/models/bamba/modular_bamba.py +17 -16
  89. transformers/models/bark/modeling_bark.py +11 -0
  90. transformers/models/bart/configuration_bart.py +0 -1
  91. transformers/models/bart/modeling_bart.py +14 -0
  92. transformers/models/barthez/tokenization_barthez.py +5 -10
  93. transformers/models/beit/image_processing_beit_fast.py +0 -1
  94. transformers/models/beit/modeling_beit.py +6 -1
  95. transformers/models/bert/modeling_bert.py +3 -0
  96. transformers/models/bert/tokenization_bert.py +8 -21
  97. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  98. transformers/models/big_bird/modeling_big_bird.py +9 -0
  99. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  100. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +15 -2
  101. transformers/models/biogpt/modeling_biogpt.py +2 -0
  102. transformers/models/biogpt/modular_biogpt.py +2 -0
  103. transformers/models/bit/modeling_bit.py +16 -3
  104. transformers/models/bitnet/modeling_bitnet.py +5 -5
  105. transformers/models/blenderbot/modeling_blenderbot.py +12 -0
  106. transformers/models/blenderbot/tokenization_blenderbot.py +18 -23
  107. transformers/models/blenderbot_small/modeling_blenderbot_small.py +12 -0
  108. transformers/models/blip/modeling_blip.py +2 -0
  109. transformers/models/blip/modeling_blip_text.py +10 -0
  110. transformers/models/blip_2/modeling_blip_2.py +4 -1
  111. transformers/models/bloom/modeling_bloom.py +17 -44
  112. transformers/models/blt/modeling_blt.py +164 -4
  113. transformers/models/blt/modular_blt.py +170 -5
  114. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  115. transformers/models/bridgetower/modeling_bridgetower.py +11 -1
  116. transformers/models/bros/modeling_bros.py +12 -0
  117. transformers/models/camembert/modeling_camembert.py +109 -106
  118. transformers/models/camembert/tokenization_camembert.py +8 -12
  119. transformers/models/canine/modeling_canine.py +11 -0
  120. transformers/models/canine/tokenization_canine.py +2 -0
  121. transformers/models/chameleon/modeling_chameleon.py +11 -5
  122. transformers/models/chinese_clip/modeling_chinese_clip.py +9 -3
  123. transformers/models/clap/feature_extraction_clap.py +2 -2
  124. transformers/models/clap/modeling_clap.py +30 -15
  125. transformers/models/clip/modeling_clip.py +2 -0
  126. transformers/models/clip/tokenization_clip.py +22 -44
  127. transformers/models/clipseg/modeling_clipseg.py +9 -0
  128. transformers/models/clvp/modeling_clvp.py +19 -3
  129. transformers/models/clvp/tokenization_clvp.py +1 -63
  130. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  131. transformers/models/codegen/modeling_codegen.py +13 -4
  132. transformers/models/codegen/tokenization_codegen.py +14 -43
  133. transformers/models/cohere/modeling_cohere.py +5 -4
  134. transformers/models/cohere/modular_cohere.py +2 -1
  135. transformers/models/cohere/tokenization_cohere.py +12 -42
  136. transformers/models/cohere2/modeling_cohere2.py +8 -7
  137. transformers/models/cohere2/modular_cohere2.py +5 -5
  138. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -4
  139. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  140. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  141. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  142. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  143. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  144. transformers/models/conditional_detr/modeling_conditional_detr.py +9 -1
  145. transformers/models/convbert/modeling_convbert.py +9 -0
  146. transformers/models/convnext/image_processing_convnext.py +2 -2
  147. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  148. transformers/models/convnext/modeling_convnext.py +2 -4
  149. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  150. transformers/models/csm/generation_csm.py +19 -22
  151. transformers/models/csm/modeling_csm.py +7 -4
  152. transformers/models/csm/modular_csm.py +2 -0
  153. transformers/models/ctrl/modeling_ctrl.py +15 -2
  154. transformers/models/cvt/modeling_cvt.py +7 -1
  155. transformers/models/cwm/modeling_cwm.py +5 -5
  156. transformers/models/d_fine/configuration_d_fine.py +3 -4
  157. transformers/models/d_fine/modeling_d_fine.py +48 -39
  158. transformers/models/d_fine/modular_d_fine.py +16 -4
  159. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  160. transformers/models/dab_detr/modeling_dab_detr.py +5 -1
  161. transformers/models/dac/modeling_dac.py +6 -6
  162. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  163. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  164. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  165. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  166. transformers/models/dbrx/configuration_dbrx.py +9 -1
  167. transformers/models/dbrx/modeling_dbrx.py +3 -3
  168. transformers/models/deberta/modeling_deberta.py +7 -0
  169. transformers/models/deberta/tokenization_deberta.py +11 -20
  170. transformers/models/deberta_v2/modeling_deberta_v2.py +8 -0
  171. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  172. transformers/models/decision_transformer/modeling_decision_transformer.py +12 -6
  173. transformers/models/deepseek_v2/modeling_deepseek_v2.py +9 -7
  174. transformers/models/deepseek_v2/modular_deepseek_v2.py +6 -4
  175. transformers/models/deepseek_v3/modeling_deepseek_v3.py +12 -7
  176. transformers/models/deepseek_v3/modular_deepseek_v3.py +7 -2
  177. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  178. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  179. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  180. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  181. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  182. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  183. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  184. transformers/models/deformable_detr/modeling_deformable_detr.py +5 -1
  185. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  186. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  187. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  188. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  189. transformers/models/detr/configuration_detr.py +1 -1
  190. transformers/models/detr/modeling_detr.py +13 -1
  191. transformers/models/dia/generation_dia.py +3 -10
  192. transformers/models/dia/modeling_dia.py +16 -4
  193. transformers/models/dia/modular_dia.py +11 -1
  194. transformers/models/dia/processing_dia.py +1 -1
  195. transformers/models/diffllama/modeling_diffllama.py +5 -5
  196. transformers/models/diffllama/modular_diffllama.py +2 -2
  197. transformers/models/dinat/modeling_dinat.py +3 -0
  198. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  199. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  200. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -2
  201. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -2
  202. transformers/models/distilbert/modeling_distilbert.py +11 -9
  203. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  204. transformers/models/doge/modeling_doge.py +3 -4
  205. transformers/models/doge/modular_doge.py +0 -1
  206. transformers/models/donut/image_processing_donut_fast.py +0 -1
  207. transformers/models/donut/modeling_donut_swin.py +18 -12
  208. transformers/models/dots1/modeling_dots1.py +23 -11
  209. transformers/models/dots1/modular_dots1.py +5 -3
  210. transformers/models/dpr/modeling_dpr.py +5 -0
  211. transformers/models/dpr/tokenization_dpr.py +12 -0
  212. transformers/models/dpt/configuration_dpt.py +1 -1
  213. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  214. transformers/models/dpt/modular_dpt.py +1 -2
  215. transformers/models/edgetam/configuration_edgetam.py +1 -1
  216. transformers/models/edgetam/modeling_edgetam.py +6 -3
  217. transformers/models/edgetam/modular_edgetam.py +15 -14
  218. transformers/models/edgetam_video/modeling_edgetam_video.py +56 -43
  219. transformers/models/edgetam_video/modular_edgetam_video.py +14 -19
  220. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  221. transformers/models/efficientloftr/modeling_efficientloftr.py +16 -3
  222. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  223. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  224. transformers/models/efficientnet/modeling_efficientnet.py +7 -1
  225. transformers/models/electra/modeling_electra.py +7 -0
  226. transformers/models/emu3/modeling_emu3.py +12 -6
  227. transformers/models/emu3/modular_emu3.py +7 -1
  228. transformers/models/encodec/modeling_encodec.py +14 -0
  229. transformers/models/eomt/image_processing_eomt.py +13 -1
  230. transformers/models/eomt/image_processing_eomt_fast.py +60 -16
  231. transformers/models/eomt/modeling_eomt.py +7 -0
  232. transformers/models/eomt/modular_eomt.py +7 -0
  233. transformers/models/ernie/modeling_ernie.py +6 -0
  234. transformers/models/ernie/modular_ernie.py +6 -0
  235. transformers/models/ernie4_5/modeling_ernie4_5.py +5 -5
  236. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  237. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +20 -17
  238. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +11 -37
  239. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  240. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  241. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  242. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  243. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  244. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  245. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  246. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  247. transformers/models/esm/modeling_esm.py +6 -0
  248. transformers/models/esm/modeling_esmfold.py +11 -5
  249. transformers/models/evolla/modeling_evolla.py +13 -5
  250. transformers/models/evolla/modular_evolla.py +8 -0
  251. transformers/models/exaone4/modeling_exaone4.py +3 -3
  252. transformers/models/exaone4/modular_exaone4.py +0 -1
  253. transformers/models/falcon/modeling_falcon.py +9 -4
  254. transformers/models/falcon_h1/modeling_falcon_h1.py +32 -26
  255. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  256. transformers/models/falcon_mamba/modeling_falcon_mamba.py +31 -37
  257. transformers/models/falcon_mamba/modular_falcon_mamba.py +19 -33
  258. transformers/models/fast_vlm/__init__.py +27 -0
  259. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  260. transformers/models/fast_vlm/modeling_fast_vlm.py +459 -0
  261. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  262. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +31 -13
  263. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  264. transformers/models/flaubert/modeling_flaubert.py +21 -15
  265. transformers/models/flava/image_processing_flava_fast.py +0 -2
  266. transformers/models/flava/modeling_flava.py +10 -2
  267. transformers/models/flex_olmo/modeling_flex_olmo.py +10 -8
  268. transformers/models/florence2/modeling_florence2.py +22 -4
  269. transformers/models/florence2/modular_florence2.py +15 -1
  270. transformers/models/fnet/modeling_fnet.py +14 -0
  271. transformers/models/focalnet/modeling_focalnet.py +4 -0
  272. transformers/models/fsmt/modeling_fsmt.py +2 -0
  273. transformers/models/funnel/modeling_funnel.py +8 -0
  274. transformers/models/funnel/tokenization_funnel.py +17 -24
  275. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  276. transformers/models/fuyu/modeling_fuyu.py +3 -1
  277. transformers/models/fuyu/processing_fuyu.py +19 -3
  278. transformers/models/gemma/modeling_gemma.py +14 -16
  279. transformers/models/gemma/modular_gemma.py +9 -11
  280. transformers/models/gemma/tokenization_gemma.py +10 -27
  281. transformers/models/gemma2/modeling_gemma2.py +5 -5
  282. transformers/models/gemma2/modular_gemma2.py +3 -2
  283. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  284. transformers/models/gemma3/modeling_gemma3.py +42 -91
  285. transformers/models/gemma3/modular_gemma3.py +38 -87
  286. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  287. transformers/models/gemma3n/modeling_gemma3n.py +65 -218
  288. transformers/models/gemma3n/modular_gemma3n.py +68 -68
  289. transformers/models/git/modeling_git.py +183 -126
  290. transformers/models/glm/modeling_glm.py +5 -5
  291. transformers/models/glm4/modeling_glm4.py +5 -5
  292. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  293. transformers/models/glm46v/modeling_glm46v.py +3 -1
  294. transformers/models/glm46v/modular_glm46v.py +3 -0
  295. transformers/models/glm4_moe/modeling_glm4_moe.py +13 -7
  296. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  297. transformers/models/glm4v/configuration_glm4v.py +3 -1
  298. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  299. transformers/models/glm4v/modeling_glm4v.py +18 -8
  300. transformers/models/glm4v/modular_glm4v.py +17 -7
  301. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  302. transformers/models/glm4v_moe/modeling_glm4v_moe.py +44 -27
  303. transformers/models/glm4v_moe/modular_glm4v_moe.py +13 -1
  304. transformers/models/glmasr/__init__.py +30 -0
  305. transformers/models/glmasr/configuration_glmasr.py +197 -0
  306. transformers/models/glmasr/modeling_glmasr.py +512 -0
  307. transformers/models/glmasr/modular_glmasr.py +433 -0
  308. transformers/models/glmasr/processing_glmasr.py +332 -0
  309. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  310. transformers/models/glpn/modeling_glpn.py +2 -0
  311. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  312. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  313. transformers/models/gpt2/modeling_gpt2.py +13 -6
  314. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  315. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +4 -8
  316. transformers/models/gpt_neo/modeling_gpt_neo.py +19 -3
  317. transformers/models/gpt_neox/modeling_gpt_neox.py +6 -3
  318. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  319. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  320. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +4 -2
  321. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  322. transformers/models/gpt_oss/modeling_gpt_oss.py +10 -14
  323. transformers/models/gpt_oss/modular_gpt_oss.py +8 -12
  324. transformers/models/gptj/modeling_gptj.py +18 -6
  325. transformers/models/granite/modeling_granite.py +5 -5
  326. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  327. transformers/models/granitemoe/modeling_granitemoe.py +6 -9
  328. transformers/models/granitemoe/modular_granitemoe.py +1 -4
  329. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  330. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +36 -28
  331. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  332. transformers/models/granitemoeshared/modeling_granitemoeshared.py +6 -9
  333. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  334. transformers/models/grounding_dino/modeling_grounding_dino.py +8 -4
  335. transformers/models/groupvit/modeling_groupvit.py +9 -1
  336. transformers/models/helium/modeling_helium.py +5 -4
  337. transformers/models/herbert/tokenization_herbert.py +9 -25
  338. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -1
  339. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -1
  340. transformers/models/hiera/modeling_hiera.py +4 -0
  341. transformers/models/hubert/modeling_hubert.py +7 -0
  342. transformers/models/hubert/modular_hubert.py +5 -0
  343. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +5 -5
  344. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  345. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  346. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +15 -7
  347. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  348. transformers/models/ibert/modeling_ibert.py +22 -0
  349. transformers/models/idefics/modeling_idefics.py +15 -21
  350. transformers/models/idefics2/modeling_idefics2.py +7 -1
  351. transformers/models/idefics3/modeling_idefics3.py +5 -1
  352. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  353. transformers/models/imagegpt/modeling_imagegpt.py +11 -3
  354. transformers/models/informer/modeling_informer.py +4 -0
  355. transformers/models/informer/modular_informer.py +1 -0
  356. transformers/models/instructblip/modeling_instructblip.py +2 -0
  357. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  358. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  359. transformers/models/internvl/modeling_internvl.py +13 -12
  360. transformers/models/internvl/modular_internvl.py +7 -13
  361. transformers/models/internvl/video_processing_internvl.py +0 -1
  362. transformers/models/jais2/__init__.py +27 -0
  363. transformers/models/jais2/configuration_jais2.py +152 -0
  364. transformers/models/jais2/modeling_jais2.py +486 -0
  365. transformers/models/jais2/modular_jais2.py +196 -0
  366. transformers/models/jamba/modeling_jamba.py +25 -20
  367. transformers/models/jamba/modular_jamba.py +17 -17
  368. transformers/models/janus/image_processing_janus_fast.py +0 -1
  369. transformers/models/janus/modeling_janus.py +16 -7
  370. transformers/models/janus/modular_janus.py +17 -7
  371. transformers/models/jetmoe/modeling_jetmoe.py +4 -4
  372. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  373. transformers/models/kosmos2/modeling_kosmos2.py +15 -2
  374. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  375. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  376. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +12 -4
  377. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  378. transformers/models/lasr/__init__.py +29 -0
  379. transformers/models/lasr/configuration_lasr.py +248 -0
  380. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  381. transformers/models/lasr/modeling_lasr.py +730 -0
  382. transformers/models/lasr/modular_lasr.py +576 -0
  383. transformers/models/lasr/processing_lasr.py +94 -0
  384. transformers/models/lasr/tokenization_lasr.py +186 -0
  385. transformers/models/layoutlm/modeling_layoutlm.py +10 -3
  386. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  387. transformers/models/layoutlmv2/modeling_layoutlmv2.py +16 -0
  388. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +11 -53
  389. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  390. transformers/models/layoutlmv3/modeling_layoutlmv3.py +33 -5
  391. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  392. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  393. transformers/models/led/modeling_led.py +12 -0
  394. transformers/models/levit/modeling_levit.py +21 -0
  395. transformers/models/lfm2/modeling_lfm2.py +5 -6
  396. transformers/models/lfm2/modular_lfm2.py +0 -1
  397. transformers/models/lfm2_moe/modeling_lfm2_moe.py +17 -8
  398. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  399. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  400. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  401. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  402. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  403. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  404. transformers/models/lightglue/modeling_lightglue.py +3 -1
  405. transformers/models/lightglue/modular_lightglue.py +1 -0
  406. transformers/models/lilt/modeling_lilt.py +23 -15
  407. transformers/models/llama/modeling_llama.py +5 -5
  408. transformers/models/llama/tokenization_llama.py +15 -43
  409. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  410. transformers/models/llama4/modeling_llama4.py +11 -6
  411. transformers/models/llava/image_processing_llava_fast.py +0 -1
  412. transformers/models/llava/modeling_llava.py +12 -7
  413. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  414. transformers/models/llava_next/modeling_llava_next.py +7 -3
  415. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  416. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  417. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  418. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  419. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  420. transformers/models/longcat_flash/modeling_longcat_flash.py +6 -5
  421. transformers/models/longcat_flash/modular_longcat_flash.py +3 -2
  422. transformers/models/longformer/modeling_longformer.py +6 -0
  423. transformers/models/longt5/modeling_longt5.py +4 -4
  424. transformers/models/luke/modeling_luke.py +9 -0
  425. transformers/models/luke/tokenization_luke.py +11 -38
  426. transformers/models/lxmert/modeling_lxmert.py +2 -0
  427. transformers/models/m2m_100/modeling_m2m_100.py +14 -0
  428. transformers/models/mamba/modeling_mamba.py +16 -23
  429. transformers/models/mamba2/modeling_mamba2.py +24 -23
  430. transformers/models/marian/configuration_marian.py +1 -1
  431. transformers/models/marian/modeling_marian.py +8 -0
  432. transformers/models/markuplm/modeling_markuplm.py +9 -8
  433. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  434. transformers/models/mask2former/configuration_mask2former.py +3 -3
  435. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  436. transformers/models/mask2former/modeling_mask2former.py +11 -0
  437. transformers/models/maskformer/configuration_maskformer.py +3 -3
  438. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  439. transformers/models/maskformer/modeling_maskformer.py +11 -1
  440. transformers/models/maskformer/modeling_maskformer_swin.py +21 -15
  441. transformers/models/mbart/configuration_mbart.py +1 -0
  442. transformers/models/mbart/modeling_mbart.py +14 -0
  443. transformers/models/mbart/tokenization_mbart.py +11 -52
  444. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  445. transformers/models/megatron_bert/modeling_megatron_bert.py +9 -0
  446. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  447. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  448. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  449. transformers/models/mimi/modeling_mimi.py +28 -5
  450. transformers/models/minimax/modeling_minimax.py +19 -6
  451. transformers/models/minimax/modular_minimax.py +12 -1
  452. transformers/models/ministral/modeling_ministral.py +5 -5
  453. transformers/models/ministral3/configuration_ministral3.py +1 -1
  454. transformers/models/ministral3/modeling_ministral3.py +5 -4
  455. transformers/models/mistral/modeling_mistral.py +5 -4
  456. transformers/models/mistral3/modeling_mistral3.py +10 -4
  457. transformers/models/mistral3/modular_mistral3.py +3 -1
  458. transformers/models/mixtral/modeling_mixtral.py +15 -7
  459. transformers/models/mixtral/modular_mixtral.py +6 -2
  460. transformers/models/mlcd/modeling_mlcd.py +6 -0
  461. transformers/models/mlcd/modular_mlcd.py +4 -0
  462. transformers/models/mllama/modeling_mllama.py +15 -4
  463. transformers/models/mluke/tokenization_mluke.py +6 -6
  464. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  465. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +8 -4
  466. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  467. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  468. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  469. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  470. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  471. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  472. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  473. transformers/models/mobilevit/modeling_mobilevit.py +7 -0
  474. transformers/models/mobilevitv2/modeling_mobilevitv2.py +7 -0
  475. transformers/models/modernbert/modeling_modernbert.py +16 -2
  476. transformers/models/modernbert/modular_modernbert.py +14 -1
  477. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +17 -10
  478. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +15 -8
  479. transformers/models/moonshine/modeling_moonshine.py +5 -3
  480. transformers/models/moshi/modeling_moshi.py +26 -53
  481. transformers/models/mpnet/modeling_mpnet.py +7 -0
  482. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  483. transformers/models/mpt/modeling_mpt.py +2 -0
  484. transformers/models/mra/modeling_mra.py +10 -1
  485. transformers/models/mt5/configuration_mt5.py +2 -3
  486. transformers/models/mt5/modeling_mt5.py +7 -10
  487. transformers/models/musicgen/modeling_musicgen.py +7 -9
  488. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -0
  489. transformers/models/mvp/modeling_mvp.py +14 -0
  490. transformers/models/nanochat/modeling_nanochat.py +5 -5
  491. transformers/models/nemotron/modeling_nemotron.py +7 -5
  492. transformers/models/nllb/tokenization_nllb.py +8 -22
  493. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  494. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  495. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  496. transformers/models/nougat/tokenization_nougat.py +15 -68
  497. transformers/models/nystromformer/modeling_nystromformer.py +13 -0
  498. transformers/models/olmo/modeling_olmo.py +5 -5
  499. transformers/models/olmo/modular_olmo.py +2 -2
  500. transformers/models/olmo2/modeling_olmo2.py +5 -6
  501. transformers/models/olmo2/modular_olmo2.py +0 -1
  502. transformers/models/olmo3/modeling_olmo3.py +5 -5
  503. transformers/models/olmoe/modeling_olmoe.py +15 -7
  504. transformers/models/olmoe/modular_olmoe.py +4 -2
  505. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  506. transformers/models/omdet_turbo/modeling_omdet_turbo.py +6 -0
  507. transformers/models/oneformer/configuration_oneformer.py +3 -3
  508. transformers/models/oneformer/modeling_oneformer.py +11 -39
  509. transformers/models/openai/modeling_openai.py +15 -0
  510. transformers/models/openai/tokenization_openai.py +10 -46
  511. transformers/models/opt/modeling_opt.py +2 -0
  512. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  513. transformers/models/ovis2/modeling_ovis2.py +15 -3
  514. transformers/models/ovis2/modular_ovis2.py +8 -0
  515. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  516. transformers/models/owlv2/modeling_owlv2.py +11 -3
  517. transformers/models/owlv2/modular_owlv2.py +0 -2
  518. transformers/models/owlvit/modeling_owlvit.py +11 -3
  519. transformers/models/paddleocr_vl/__init__.py +32 -0
  520. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  521. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +504 -0
  522. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  523. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1682 -0
  524. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1359 -0
  525. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  526. transformers/models/paligemma/modeling_paligemma.py +25 -17
  527. transformers/models/parakeet/configuration_parakeet.py +4 -6
  528. transformers/models/parakeet/modeling_parakeet.py +14 -6
  529. transformers/models/parakeet/modular_parakeet.py +7 -2
  530. transformers/models/parakeet/processing_parakeet.py +1 -0
  531. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  532. transformers/models/patchtsmixer/modeling_patchtsmixer.py +10 -0
  533. transformers/models/patchtst/modeling_patchtst.py +25 -6
  534. transformers/models/pe_audio/__init__.py +30 -0
  535. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  536. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  537. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  538. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  539. transformers/{kernels/falcon_mamba/__init__.py → models/pe_audio/processing_pe_audio.py} +11 -2
  540. transformers/models/pe_audio_video/__init__.py +29 -0
  541. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  542. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  543. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  544. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  545. transformers/models/pe_video/__init__.py +30 -0
  546. transformers/models/pe_video/configuration_pe_video.py +211 -0
  547. transformers/models/pe_video/modeling_pe_video.py +636 -0
  548. transformers/models/pe_video/modular_pe_video.py +219 -0
  549. transformers/models/pe_video/processing_pe_video.py +10 -0
  550. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  551. transformers/models/pegasus/configuration_pegasus.py +1 -0
  552. transformers/models/pegasus/modeling_pegasus.py +8 -0
  553. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  554. transformers/models/pegasus_x/modeling_pegasus_x.py +5 -0
  555. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  556. transformers/models/perceiver/modeling_perceiver.py +13 -1
  557. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  558. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  559. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  560. transformers/models/persimmon/modeling_persimmon.py +3 -2
  561. transformers/models/phi/modeling_phi.py +5 -6
  562. transformers/models/phi/modular_phi.py +0 -1
  563. transformers/models/phi3/modeling_phi3.py +3 -2
  564. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +9 -6
  565. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +7 -4
  566. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  567. transformers/models/phimoe/modeling_phimoe.py +15 -7
  568. transformers/models/phimoe/modular_phimoe.py +3 -3
  569. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  570. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  571. transformers/models/pixio/__init__.py +30 -0
  572. transformers/models/pixio/configuration_pixio.py +151 -0
  573. transformers/models/pixio/modeling_pixio.py +507 -0
  574. transformers/models/pixio/modular_pixio.py +404 -0
  575. transformers/models/pixtral/modeling_pixtral.py +3 -2
  576. transformers/models/pixtral/processing_pixtral.py +3 -1
  577. transformers/models/plbart/configuration_plbart.py +1 -0
  578. transformers/models/plbart/modeling_plbart.py +13 -0
  579. transformers/models/plbart/modular_plbart.py +8 -0
  580. transformers/models/plbart/tokenization_plbart.py +0 -2
  581. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  582. transformers/models/poolformer/modeling_poolformer.py +13 -1
  583. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  584. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  585. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  586. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  587. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  588. transformers/models/prophetnet/modeling_prophetnet.py +5 -1
  589. transformers/models/pvt/modeling_pvt.py +2 -0
  590. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  591. transformers/models/qwen2/modeling_qwen2.py +5 -5
  592. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  593. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  594. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +116 -79
  595. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +71 -33
  596. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  597. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +23 -11
  598. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +29 -27
  599. transformers/models/qwen2_audio/modeling_qwen2_audio.py +4 -2
  600. transformers/models/qwen2_moe/modeling_qwen2_moe.py +15 -7
  601. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  602. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  603. transformers/models/qwen2_vl/modeling_qwen2_vl.py +23 -20
  604. transformers/models/qwen3/modeling_qwen3.py +5 -5
  605. transformers/models/qwen3_moe/modeling_qwen3_moe.py +15 -7
  606. transformers/models/qwen3_next/modeling_qwen3_next.py +7 -8
  607. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  608. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +112 -68
  609. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +62 -20
  610. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  611. transformers/models/qwen3_vl/modeling_qwen3_vl.py +57 -42
  612. transformers/models/qwen3_vl/modular_qwen3_vl.py +59 -46
  613. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  614. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +132 -148
  615. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +36 -82
  616. transformers/models/rag/configuration_rag.py +0 -8
  617. transformers/models/rag/modeling_rag.py +8 -9
  618. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +18 -3
  619. transformers/models/reformer/modeling_reformer.py +13 -1
  620. transformers/models/reformer/tokenization_reformer.py +11 -28
  621. transformers/models/regnet/modeling_regnet.py +10 -1
  622. transformers/models/rembert/modeling_rembert.py +13 -1
  623. transformers/models/rembert/tokenization_rembert.py +3 -10
  624. transformers/models/resnet/modeling_resnet.py +19 -5
  625. transformers/models/roberta/modeling_roberta.py +3 -0
  626. transformers/models/roberta/modular_roberta.py +3 -0
  627. transformers/models/roberta/tokenization_roberta.py +18 -27
  628. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  629. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  630. transformers/models/roformer/modeling_roformer.py +6 -0
  631. transformers/models/roformer/tokenization_roformer.py +77 -412
  632. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  633. transformers/models/rt_detr/modeling_rt_detr.py +6 -0
  634. transformers/models/rt_detr/modeling_rt_detr_resnet.py +13 -4
  635. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  636. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +9 -0
  637. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  638. transformers/models/rwkv/modeling_rwkv.py +2 -1
  639. transformers/models/sam/configuration_sam.py +1 -0
  640. transformers/models/sam/image_processing_sam_fast.py +0 -1
  641. transformers/models/sam/modeling_sam.py +4 -1
  642. transformers/models/sam2/configuration_sam2.py +1 -1
  643. transformers/models/sam2/modeling_sam2.py +7 -3
  644. transformers/models/sam2/modular_sam2.py +7 -3
  645. transformers/models/sam2_video/modeling_sam2_video.py +52 -43
  646. transformers/models/sam2_video/modular_sam2_video.py +32 -18
  647. transformers/models/sam3/configuration_sam3.py +21 -1
  648. transformers/models/sam3/modeling_sam3.py +100 -80
  649. transformers/models/sam3_tracker/modeling_sam3_tracker.py +8 -1
  650. transformers/models/sam3_tracker/modular_sam3_tracker.py +8 -1
  651. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  652. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +27 -15
  653. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  654. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  655. transformers/models/sam3_video/modeling_sam3_video.py +4 -3
  656. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  657. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  658. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  659. transformers/models/seamless_m4t/modeling_seamless_m4t.py +32 -12
  660. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  661. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +11 -1
  662. transformers/models/seed_oss/modeling_seed_oss.py +3 -3
  663. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  664. transformers/models/segformer/modeling_segformer.py +6 -3
  665. transformers/models/segformer/modular_segformer.py +0 -1
  666. transformers/models/seggpt/modeling_seggpt.py +2 -0
  667. transformers/models/sew/modeling_sew.py +3 -0
  668. transformers/models/sew/modular_sew.py +1 -0
  669. transformers/models/sew_d/modeling_sew_d.py +3 -0
  670. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  671. transformers/models/siglip/modeling_siglip.py +24 -2
  672. transformers/models/siglip2/modeling_siglip2.py +67 -41
  673. transformers/models/siglip2/modular_siglip2.py +4 -0
  674. transformers/models/smollm3/modeling_smollm3.py +5 -5
  675. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  676. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  677. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  678. transformers/models/speech_to_text/modeling_speech_to_text.py +14 -0
  679. transformers/models/speecht5/modeling_speecht5.py +41 -1
  680. transformers/models/splinter/modeling_splinter.py +12 -3
  681. transformers/models/splinter/tokenization_splinter.py +9 -28
  682. transformers/models/squeezebert/modeling_squeezebert.py +8 -0
  683. transformers/models/stablelm/modeling_stablelm.py +4 -2
  684. transformers/models/starcoder2/modeling_starcoder2.py +5 -4
  685. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  686. transformers/models/superglue/modeling_superglue.py +1 -0
  687. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  688. transformers/models/superpoint/modeling_superpoint.py +1 -0
  689. transformers/models/swiftformer/modeling_swiftformer.py +6 -0
  690. transformers/models/swin/modeling_swin.py +20 -12
  691. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  692. transformers/models/swin2sr/modeling_swin2sr.py +51 -33
  693. transformers/models/swinv2/modeling_swinv2.py +45 -33
  694. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  695. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  696. transformers/models/t5/configuration_t5.py +7 -1
  697. transformers/models/t5/modeling_t5.py +8 -7
  698. transformers/models/t5/tokenization_t5.py +4 -8
  699. transformers/models/t5gemma/modeling_t5gemma.py +6 -6
  700. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  701. transformers/models/t5gemma2/modeling_t5gemma2.py +19 -10
  702. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  703. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  704. transformers/models/table_transformer/modeling_table_transformer.py +5 -1
  705. transformers/models/tapas/modeling_tapas.py +3 -0
  706. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  707. transformers/models/textnet/modeling_textnet.py +11 -2
  708. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  709. transformers/models/timesfm/modeling_timesfm.py +14 -0
  710. transformers/models/timesfm/modular_timesfm.py +14 -0
  711. transformers/models/timesformer/modeling_timesformer.py +2 -0
  712. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  713. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  714. transformers/models/timm_wrapper/modeling_timm_wrapper.py +20 -14
  715. transformers/models/trocr/modeling_trocr.py +3 -2
  716. transformers/models/tvp/configuration_tvp.py +5 -1
  717. transformers/models/tvp/modeling_tvp.py +6 -4
  718. transformers/models/udop/configuration_udop.py +1 -0
  719. transformers/models/udop/modeling_udop.py +7 -7
  720. transformers/models/udop/tokenization_udop.py +5 -13
  721. transformers/models/umt5/configuration_umt5.py +2 -2
  722. transformers/models/umt5/modeling_umt5.py +7 -6
  723. transformers/models/unispeech/modeling_unispeech.py +4 -0
  724. transformers/models/unispeech/modular_unispeech.py +2 -0
  725. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  726. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  727. transformers/models/univnet/modeling_univnet.py +1 -0
  728. transformers/models/upernet/modeling_upernet.py +1 -0
  729. transformers/models/vaultgemma/modeling_vaultgemma.py +5 -5
  730. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  731. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  732. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  733. transformers/models/video_llava/modeling_video_llava.py +7 -3
  734. transformers/models/vilt/configuration_vilt.py +2 -2
  735. transformers/models/vilt/modeling_vilt.py +13 -0
  736. transformers/models/vipllava/modeling_vipllava.py +7 -3
  737. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  738. transformers/models/visual_bert/modeling_visual_bert.py +8 -0
  739. transformers/models/vitdet/modeling_vitdet.py +2 -0
  740. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  741. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  742. transformers/models/vitmatte/modeling_vitmatte.py +5 -0
  743. transformers/models/vitpose/configuration_vitpose.py +1 -1
  744. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  745. transformers/models/vits/modeling_vits.py +1 -0
  746. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  747. transformers/models/voxtral/modeling_voxtral.py +2 -2
  748. transformers/models/voxtral/modular_voxtral.py +2 -2
  749. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  750. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +21 -10
  751. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +12 -0
  752. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +27 -11
  753. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  754. transformers/models/wavlm/modeling_wavlm.py +5 -0
  755. transformers/models/whisper/generation_whisper.py +1 -0
  756. transformers/models/whisper/modeling_whisper.py +11 -3
  757. transformers/models/whisper/tokenization_whisper.py +4 -15
  758. transformers/models/x_clip/modeling_x_clip.py +5 -0
  759. transformers/models/xcodec/modeling_xcodec.py +5 -0
  760. transformers/models/xglm/modeling_xglm.py +11 -0
  761. transformers/models/xglm/tokenization_xglm.py +4 -9
  762. transformers/models/xlm/modeling_xlm.py +18 -14
  763. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  764. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  765. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  766. transformers/models/xlnet/modeling_xlnet.py +3 -1
  767. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  768. transformers/models/xmod/modeling_xmod.py +3 -0
  769. transformers/models/yoso/modeling_yoso.py +10 -1
  770. transformers/models/zamba/modeling_zamba.py +4 -1
  771. transformers/models/zamba2/modeling_zamba2.py +7 -4
  772. transformers/models/zamba2/modular_zamba2.py +1 -1
  773. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  774. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  775. transformers/models/zoedepth/modeling_zoedepth.py +8 -0
  776. transformers/pipelines/__init__.py +11 -9
  777. transformers/pipelines/automatic_speech_recognition.py +20 -12
  778. transformers/pipelines/base.py +2 -10
  779. transformers/pipelines/document_question_answering.py +4 -2
  780. transformers/pipelines/question_answering.py +1 -1
  781. transformers/pipelines/text_generation.py +1 -1
  782. transformers/pipelines/text_to_audio.py +2 -2
  783. transformers/processing_utils.py +133 -50
  784. transformers/quantizers/auto.py +2 -4
  785. transformers/quantizers/base.py +44 -174
  786. transformers/quantizers/quantizer_aqlm.py +2 -23
  787. transformers/quantizers/quantizer_auto_round.py +2 -12
  788. transformers/quantizers/quantizer_awq.py +20 -89
  789. transformers/quantizers/quantizer_bitnet.py +4 -14
  790. transformers/quantizers/quantizer_bnb_4bit.py +18 -155
  791. transformers/quantizers/quantizer_bnb_8bit.py +24 -110
  792. transformers/quantizers/quantizer_compressed_tensors.py +2 -9
  793. transformers/quantizers/quantizer_eetq.py +16 -74
  794. transformers/quantizers/quantizer_fbgemm_fp8.py +38 -138
  795. transformers/quantizers/quantizer_finegrained_fp8.py +26 -113
  796. transformers/quantizers/quantizer_fp_quant.py +52 -82
  797. transformers/quantizers/quantizer_gptq.py +8 -28
  798. transformers/quantizers/quantizer_higgs.py +42 -60
  799. transformers/quantizers/quantizer_hqq.py +144 -153
  800. transformers/quantizers/quantizer_mxfp4.py +14 -194
  801. transformers/quantizers/quantizer_quanto.py +35 -79
  802. transformers/quantizers/quantizer_quark.py +36 -17
  803. transformers/quantizers/quantizer_spqr.py +4 -12
  804. transformers/quantizers/quantizer_torchao.py +50 -325
  805. transformers/quantizers/quantizer_vptq.py +4 -27
  806. transformers/quantizers/quantizers_utils.py +20 -0
  807. transformers/testing_utils.py +324 -47
  808. transformers/tokenization_mistral_common.py +7 -2
  809. transformers/tokenization_utils_base.py +116 -224
  810. transformers/tokenization_utils_tokenizers.py +190 -106
  811. transformers/trainer.py +51 -32
  812. transformers/trainer_callback.py +8 -0
  813. transformers/trainer_jit_checkpoint.py +126 -0
  814. transformers/trainer_seq2seq.py +4 -0
  815. transformers/trainer_utils.py +1 -1
  816. transformers/training_args.py +74 -38
  817. transformers/utils/__init__.py +7 -4
  818. transformers/utils/attention_visualizer.py +4 -4
  819. transformers/utils/auto_docstring.py +35 -25
  820. transformers/utils/generic.py +47 -1
  821. transformers/utils/hub.py +5 -15
  822. transformers/utils/import_utils.py +112 -25
  823. transformers/utils/kernel_config.py +74 -19
  824. transformers/utils/loading_report.py +19 -10
  825. transformers/utils/quantization_config.py +78 -245
  826. transformers/video_processing_utils.py +17 -14
  827. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +275 -229
  828. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +832 -777
  829. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +1 -1
  830. transformers/kernels/__init__.py +0 -0
  831. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  832. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  833. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  834. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info/licenses}/LICENSE +0 -0
  835. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/glm4_moe/modeling_glm4_moe.py

@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -38,8 +38,8 @@ from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import check_model_inputs
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_glm4_moe import Glm4MoeConfig
 
 
@@ -60,7 +60,7 @@ class Glm4MoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
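
Note on the change above: registering original_inv_freq as a non-persistent buffer rather than a plain attribute means the saved copy follows model.to(...) device and dtype moves, while persistent=False keeps it out of the state dict. A minimal sketch of that behavior (not the transformers source):

    import torch
    from torch import nn

    class RotaryBuffers(nn.Module):
        def __init__(self):
            super().__init__()
            inv_freq = 1.0 / (10000 ** (torch.arange(0, 8, 2).float() / 8))
            # Buffers travel with .to()/.cuda(); persistent=False keeps them
            # out of state_dict(), so existing checkpoints are unaffected.
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    m = RotaryBuffers()
    assert "original_inv_freq" not in m.state_dict()  # non-persistent: not serialized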
@@ -101,7 +101,7 @@ class Glm4MoeRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
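
maybe_autocast comes from transformers.utils.generic (see the import hunk above); a plausible reading of the swap is that it behaves like torch.autocast on backends that support autocast and degrades to a no-op elsewhere. A hypothetical stand-in, not the library's implementation:

    import contextlib
    import torch

    def maybe_autocast(device_type: str, enabled: bool = True, **kwargs):
        try:
            # Defer to torch.autocast on backends that support it.
            return torch.autocast(device_type=device_type, enabled=enabled, **kwargs)
        except RuntimeError:
            # Unsupported backend: fall back to a no-op context.
            return contextlib.nullcontext()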
@@ -193,6 +193,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     return q_embed, k_embed
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class Glm4MoeAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
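
use_kernelized_func presumably records apply_rotary_pos_emb as the eager reference so a hub kernel with the same signature can be swapped in when one is available. An illustrative decorator pattern only, assuming a registry attribute on the class (not the transformers implementation):

    def use_kernelized_func(reference_fn):
        # Hypothetical: remember the eager function under its name so a loader
        # can later substitute a faster kernel with the same signature.
        def decorator(cls):
            registry = dict(getattr(cls, "_kernelizable_funcs", {}))
            registry[reference_fn.__name__] = reference_fn
            cls._kernelizable_funcs = registry
            return cls
        return decorator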
@@ -331,6 +332,7 @@ class Glm4MoeRMSNorm(nn.Module):
         return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
 
 
+@use_experts_implementation
 class Glm4MoeNaiveMoe(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -338,7 +340,7 @@ class Glm4MoeNaiveMoe(nn.Module):
338
340
  super().__init__()
339
341
  self.num_experts = config.num_local_experts
340
342
  self.hidden_dim = config.hidden_size
341
- self.intermediate_dim = config.intermediate_size
343
+ self.intermediate_dim = config.moe_intermediate_size
342
344
  self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
343
345
  self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
344
346
  self.act_fn = ACT2FN[config.hidden_act]
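This hunk fixes the expert width: the experts are sized by `moe_intermediate_size`, which in MoE checkpoints is typically much narrower than the dense `intermediate_size`. A shape sketch of the fused 3D expert weights above (sizes illustrative, SiLU assumed for `act_fn`):

```python
import torch
import torch.nn.functional as F

num_experts, hidden, moe_inter = 4, 16, 32  # illustrative sizes
gate_up_proj = torch.randn(num_experts, 2 * moe_inter, hidden)  # fused gate+up
down_proj = torch.randn(num_experts, hidden, moe_inter)

x = torch.randn(8, hidden)  # 8 tokens routed to expert 0
gate, up = (x @ gate_up_proj[0].T).chunk(2, dim=-1)  # (8, moe_inter) each
y = (F.silu(gate) * up) @ down_proj[0].T  # back to (8, hidden)
assert y.shape == (8, hidden)
```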
@@ -485,18 +487,22 @@ class Glm4MoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "hidden_states": Glm4MoeDecoderLayer,
         "attentions": Glm4MoeAttention,
     }
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]

     @torch.no_grad()
     def _init_weights(self, module):
         super()._init_weights(module)
         if isinstance(module, Glm4MoeTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, Glm4MoeNaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
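`_can_compile_fullgraph` is now computed once at class-creation time from `is_grouped_mm_available()`: fullgraph `torch.compile` over the MoE path is only advertised when a grouped-GEMM primitive exists in the installed PyTorch (see the linked docs). A hedged sketch of that gating pattern; the real check inside transformers may probe a different symbol or version:

```python
import torch


def grouped_mm_available() -> bool:
    """Illustrative stand-in for transformers' is_grouped_mm_available():
    report whether a grouped matrix-multiply primitive is present. The exact
    probe used by the library may differ."""
    return hasattr(torch, "_grouped_mm")


class CompileGatedModel:
    # Evaluated once when the class object is built, so every checkpoint
    # loaded on this install shares the same capability flag.
    _can_compile_fullgraph = grouped_mm_available()
```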
@@ -273,7 +273,7 @@ class Glm4MoeDecoderLayer(DeepseekV3DecoderLayer):


 class Glm4MoePreTrainedModel(DeepseekV3PreTrainedModel):
-    _can_compile_fullgraph = False
+    pass


 class Glm4MoeModel(DeepseekV3Model):
@@ -234,7 +234,9 @@ class Glm4vTextConfig(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.rope_parameters = rope_parameters

-        super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs)
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )


 class Glm4vConfig(PreTrainedConfig):
@@ -353,7 +353,6 @@ class Glm4vImageProcessor(BaseImageProcessor):
             image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                 Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                 `True`.
-                The max pixels of the image to resize the image.
             patch_size (`int`, *optional*, defaults to `self.patch_size`):
                 The spatial patch size of the vision encoder.
             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
@@ -380,12 +379,9 @@ class Glm4vImageProcessor(BaseImageProcessor):
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

         """
-        # Try to use config values if set, otherwise fallback to global defaults
         size = size if size is not None else self.size
         if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
             raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
-        elif size is None:
-            size = {"shortest_edge": 112 * 112, "longest_edge": 28 * 28 * 15000}

         do_resize = do_resize if do_resize is not None else self.do_resize
         resample = resample if resample is not None else self.resample
@@ -28,6 +28,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
@@ -40,7 +41,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from .configuration_glm4v import Glm4vConfig, Glm4vTextConfig, Glm4vVisionConfig


@@ -104,6 +105,8 @@ class Glm4vVisionRotaryEmbedding(nn.Module):

     def __init__(self, dim: int, theta: float = 10000.0) -> None:
         super().__init__()
+        self.dim = dim
+        self.theta = theta
         inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)

@@ -141,7 +144,6 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -313,8 +315,8 @@ class Glm4vVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

-        if self.config._attn_implementation == "flash_attention_2":
-            # Flash Attention 2: Use cu_seqlens for variable length attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
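Loosening the equality test to a substring check lets every flash-style backend take the variable-length `cu_seqlens` path, not just the one spelled `flash_attention_2`. For instance (the hub-kernel name below is illustrative; only the two `flash_attention_*` strings are confirmed by this diff):

```python
# Backends that hit the cu_seqlens branch under the new check:
for impl in ("flash_attention_2", "flash_attention_3", "kernels-community/flash-attn"):
    assert "flash" in impl

# ...while SDPA and eager still take the regular padded-attention path:
for impl in ("sdpa", "eager"):
    assert "flash" not in impl
```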
@@ -403,7 +405,7 @@ class Glm4vTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -446,7 +448,7 @@ class Glm4vTextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -705,6 +707,12 @@ class Glm4vPreTrainedModel(PreTrainedModel):
         "attentions": Glm4vTextAttention,
     }

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
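Storing `dim` and `theta` on the module (earlier hunk) is what makes this `_init_weights` addition possible: a non-persistent buffer is absent from the checkpoint, so when the model is materialized its inverse frequencies must be recomputed from the saved hyperparameters and written back in place. A minimal standalone sketch of that recompute step, with `init.copy_` approximated by `torch.Tensor.copy_`:

```python
import torch

dim, theta = 8, 10000.0  # illustrative hyperparameters
inv_freq_buf = torch.empty(dim // 2)  # buffer as allocated; contents undefined

# Recompute the rotary table from the stored hyperparameters and write it
# into the existing buffer in place (transformers routes this through its
# initialization.copy_ helper).
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
inv_freq_buf.copy_(inv_freq)
```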
@@ -768,7 +776,7 @@ class Glm4vVisionModel(Glm4vPreTrainedModel):
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb, pos_ids

-    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1487,6 +1495,7 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1503,13 +1512,14 @@ class Glm4vForConditionalGeneration(Glm4vPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None

-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

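The old gate inferred "first step" from `cache_position[0] != 0`, which misfires when `use_cache=False`: every step then looks like a prefill, so the image tensors would be dropped (or re-encoded) incorrectly. The explicit flag from `generate()` makes the intent testable in isolation; a sketch:

```python
def should_drop_vision_inputs(is_first_iteration: bool, use_cache: bool) -> bool:
    """Sketch of the new gate: pixel tensors are forwarded on the first step,
    and on every step when the cache is disabled (nothing remembers them)."""
    return not is_first_iteration and use_cache


assert should_drop_vision_inputs(is_first_iteration=True, use_cache=True) is False
assert should_drop_vision_inputs(is_first_iteration=False, use_cache=True) is True
assert should_drop_vision_inputs(is_first_iteration=False, use_cache=False) is False
```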
@@ -22,6 +22,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import LayerNorm

+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
@@ -32,11 +33,11 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast
 from ...modeling_rope_utils import RopeParameters
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...tokenization_utils_base import PreTokenizedInput, TextInput
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
-from ...utils.generic import check_model_inputs
+from ...utils.generic import check_model_inputs, maybe_autocast
 from ...video_utils import VideoInput
 from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, Glm4RotaryEmbedding, eager_attention_forward
 from ..qwen2_5_vl.modeling_qwen2_5_vl import (
@@ -271,7 +272,9 @@ class Glm4vTextConfig(PreTrainedConfig):
         self.attention_dropout = attention_dropout
         self.rope_parameters = rope_parameters

-        super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs)
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )


 class Glm4vConfig(PreTrainedConfig):
@@ -407,7 +410,6 @@ class Glm4vVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -509,7 +511,7 @@ class Glm4vTextRotaryEmbedding(Glm4RotaryEmbedding):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -723,6 +725,12 @@ class Glm4vPreTrainedModel(Qwen2_5_VLPreTrainedModel):
         "attentions": Glm4vTextAttention,
     }

+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Glm4vVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vVisionModel(Glm4vPreTrainedModel):
     config: Glm4vVisionConfig
@@ -786,7 +794,7 @@ class Glm4vVisionModel(Glm4vPreTrainedModel):
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb, pos_ids

-    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1412,6 +1420,7 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1428,13 +1437,14 @@ class Glm4vForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None

-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

@@ -280,7 +280,9 @@ class Glm4vMoeTextConfig(PreTrainedConfig):
         self.first_k_dense_replace = first_k_dense_replace
         self.norm_topk_prob = norm_topk_prob
         self.router_aux_loss_coef = router_aux_loss_coef
-        super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs)
+        super().__init__(
+            tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
+        )


 class Glm4vMoeConfig(PreTrainedConfig):
@@ -32,7 +32,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
@@ -40,8 +40,14 @@ from ...modeling_outputs import ModelOutput, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_grouped_mm_available,
+    is_torchdynamo_compiling,
+)
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_glm4v_moe import Glm4vMoeConfig, Glm4vMoeTextConfig, Glm4vMoeVisionConfig

@@ -107,7 +113,7 @@ class Glm4vMoeTextRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

     @staticmethod
     def compute_default_rope_parameters(
@@ -150,7 +156,7 @@ class Glm4vMoeTextRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, :, None, :].float()  # shape (3, bs, 1, positions)

         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
@@ -299,6 +305,7 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim
     return q_embed, k_embed


+@use_kernelized_func(apply_rotary_pos_emb)
 class Glm4vMoeTextAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -322,7 +329,6 @@ class Glm4vMoeTextAttention(nn.Module):
             config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
         )
         self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
-        self.rotary_fn = apply_rotary_pos_emb
         self.rope_parameters = config.rope_parameters

     def forward(
@@ -395,6 +401,7 @@ class Glm4vMoeTextTopkRouter(nn.Module):
         return router_logits


+@use_experts_implementation
 class Glm4vMoeTextNaiveMoe(nn.Module):
     """Collection of expert weights stored as 3D tensors."""

@@ -402,7 +409,7 @@ class Glm4vMoeTextNaiveMoe(nn.Module):
         super().__init__()
         self.num_experts = config.num_local_experts
         self.hidden_dim = config.hidden_size
-        self.intermediate_dim = config.intermediate_size
+        self.intermediate_dim = config.moe_intermediate_size
         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
         self.act_fn = ACT2FN[config.hidden_act]
@@ -586,7 +593,9 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True

     _can_record_outputs = {
@@ -594,6 +603,7 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
         "attentions": Glm4vMoeTextAttention,
         "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
     }
+    _keep_in_fp32_modules_strict = ["e_score_correction_bias"]
     input_modalities = ("text", "image", "video")

     @torch.no_grad()
@@ -601,9 +611,13 @@ class Glm4vMoePreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, Glm4vMoeTextTopkRouter):
             init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
+            init.zeros_(module.e_score_correction_bias)
         elif isinstance(module, Glm4vMoeTextNaiveMoe):
             init.normal_(module.gate_up_proj, mean=0.0, std=self.config.initializer_range)
             init.normal_(module.down_proj, mean=0.0, std=self.config.initializer_range)
+        if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)


 @dataclass
@@ -636,6 +650,22 @@ class Glm4vMoeCausalLMOutputWithPast(ModelOutput):
     aux_loss: Optional[torch.FloatTensor] = None


+class Glm4vMoeVisionRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+
+
 class Glm4vMoeisionMlp(nn.Module):
     def __init__(self, config, bias: bool = False):
         super().__init__()
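Apart from recording `dim` and `theta` for the re-initialization path, the class body is unchanged; it only moves above its first use (the matching removal follows). A usage sketch, assuming the class above is in scope:

```python
import torch

emb = Glm4vMoeVisionRotaryEmbedding(dim=8)  # inv_freq has dim // 2 = 4 entries
freqs = emb(seqlen=4)  # outer product of positions and inverse frequencies
assert freqs.shape == (4, 4)  # (seqlen, dim // 2)
```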
@@ -670,20 +700,6 @@ class Glm4vMoeVisionPatchEmbed(nn.Module):
         return hidden_states


-class Glm4vMoeVisionRotaryEmbedding(nn.Module):
-    inv_freq: torch.Tensor  # fix linting for `register_buffer`
-
-    def __init__(self, dim: int, theta: float = 10000.0) -> None:
-        super().__init__()
-        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-    def forward(self, seqlen: int) -> torch.Tensor:
-        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
-        freqs = torch.outer(seq, self.inv_freq)
-        return freqs
-
-
 class Glm4vMoeVisionPatchMerger(nn.Module):
     def __init__(self, dim: int, context_dim: int, hidden_act: str, bias: bool = False) -> None:
         super().__init__()
@@ -712,7 +728,6 @@ class Glm4vMoeVisionEmbeddings(nn.Module):
         self.num_patches = (self.image_size // self.patch_size) ** 2
         self.num_positions = self.num_patches
         self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

     def forward(self, embeddings, lengths, image_shapes, h_coords, w_coords) -> torch.Tensor:
         """
@@ -839,8 +854,8 @@ class Glm4vMoeVisionAttention(nn.Module):
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

-        if self.config._attn_implementation == "flash_attention_2":
-            # Flash Attention 2: Use cu_seqlens for variable length attention
+        if "flash" in self.config._attn_implementation:
+            # Flash Attention: Use cu_seqlens for variable length attention
             max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
             attn_output, _ = attention_interface(
                 self,
@@ -975,7 +990,7 @@ class Glm4vMoeVisionModel(Glm4vMoePreTrainedModel):
         rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
         return rotary_pos_emb, pos_ids

-    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
         """
         Args:
             hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
@@ -1762,6 +1777,7 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1778,13 +1794,14 @@ class Glm4vMoeForConditionalGeneration(Glm4vMoePreTrainedModel, GenerationMixin)
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )

         # GLM-4.1V position_ids are prepareed with rope_deltas in forward
         model_inputs["position_ids"] = None

-        if cache_position[0] != 0:
+        if not is_first_iteration and use_cache:
             model_inputs["pixel_values"] = None
             model_inputs["pixel_values_videos"] = None

@@ -18,6 +18,7 @@ from typing import Optional, Union
 import torch
 import torch.nn as nn

+from ... import initialization as init
 from ...cache_utils import Cache, DynamicCache
 from ...configuration_utils import PreTrainedConfig
 from ...masking_utils import create_causal_mask
@@ -46,6 +47,7 @@ from ..glm4v.modeling_glm4v import (
     Glm4vTextModel,
     Glm4vTextRotaryEmbedding,
     Glm4vVisionModel,
+    Glm4vVisionRotaryEmbedding,
     rotate_half,
 )
 from ..qwen3_vl_moe.modeling_qwen3_vl_moe import (
@@ -227,7 +229,7 @@ class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin):
         self.norm_topk_prob = norm_topk_prob
         self.router_aux_loss_coef = router_aux_loss_coef
         PreTrainedConfig.__init__(
-            self, tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs
+            self, tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs
         )

@@ -479,11 +481,21 @@ class Glm4vMoePreTrainedModel(Glm4MoePreTrainedModel):
         "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
     }

+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Glm4vMoeVisionRotaryEmbedding):
+            inv_freq = 1.0 / (module.theta ** (torch.arange(0, module.dim, 2, dtype=torch.float) / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+

 class Glm4vMoeCausalLMOutputWithPast(Qwen3VLMoeCausalLMOutputWithPast):
     pass


+class Glm4vMoeVisionRotaryEmbedding(Glm4vVisionRotaryEmbedding):
+    pass
+
+
 @auto_docstring
 class Glm4vMoeVisionModel(Glm4vVisionModel):
     pass
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_glmasr import *
+    from .modeling_glmasr import *
+    from .processing_glmasr import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
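This new `glmasr` package init follows the `_LazyModule` boilerplate used across transformers: at type-checking time the star imports are real, while at runtime the module object is replaced by a proxy that imports submodules only on first attribute access. A usage sketch; the attribute name below is hypothetical, chosen only to show when the deferred import fires:

```python
import transformers.models.glmasr as glmasr  # cheap: nothing heavy imported yet

# Touching an attribute triggers the real submodule import via _LazyModule.
# "GlmAsrConfig" is a hypothetical name standing in for whatever
# configuration_glmasr actually exports.
# config_cls = glmasr.GlmAsrConfig
```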