transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0

transformers/quantizers/quantizer_finegrained_fp8.py

@@ -20,26 +20,20 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
     Supports both e4m3fn formats based on platform.
     """
 
-    requires_parameters_quantization = True
     requires_calibration = False
-    required_packages = ["accelerate"]
 
     def __init__(self, quantization_config, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.quantization_config = quantization_config
 
     def validate_environment(self, *args, **kwargs):
-        if not is_torch_available():
-            raise ImportError(
-                "Using fp8 quantization requires torch >= 2.1.0"
-                "Please install the latest version of torch ( pip install --upgrade torch )"
-            )
-
         if not is_accelerate_available():
             raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install accelerate`)")
 
-        if (not (torch.cuda.is_available() or is_torch_xpu_available())) and not self.quantization_config.dequantize:
-            if self.pre_quantized:
+        if self.quantization_config.dequantize:
+            return
+
+        if not torch.cuda.is_available() and not is_torch_xpu_available():
+            if self.pre_quantized and not self.quantization_config.dequantize:
                 logger.warning_once(
                     "Using FP8 quantized models requires a GPU or XPU, we will default to dequantizing the model to bf16 since no GPU or XPU is available"
                 )
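The reworked validate_environment short-circuits as soon as the config asks for dequantization, so loading an FP8 checkpoint on a CPU-only machine becomes an explicit path rather than a warning-driven fallback. A minimal usage sketch, assuming the FineGrainedFP8Config exported by transformers accepts the `dequantize` flag read above (the attribute is visible in this diff, but the constructor signature is not), with a placeholder model id:

    from transformers import AutoModelForCausalLM, FineGrainedFP8Config

    # Placeholder checkpoint id; `dequantize=True` mirrors the config attribute
    # checked in validate_environment and is assumed to be a constructor argument.
    model = AutoModelForCausalLM.from_pretrained(
        "your-org/fp8-quantized-model",
        quantization_config=FineGrainedFP8Config(dequantize=True),
    )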
@@ -64,11 +58,12 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
                     "your model on a GPU or XPU device in order to run your model. To remove this warning, "
                     "pass device_map = 'cuda' or 'xpu'. "
                 )
-        elif device_map is not None:
+        elif isinstance(device_map, dict):
             if (
                 not self.pre_quantized
-                and isinstance(device_map, dict)
-                and ("cpu" in device_map.values() or "disk" in device_map.values())
+                and len(device_map) > 1
+                and "cpu" in device_map.values()
+                or "disk" in device_map.values()
             ):
                 raise ValueError(
                     "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device."
@@ -76,76 +71,6 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
                     "Please use a quantized checkpoint or remove the cpu/disk device from the device_map."
                 )
 
-    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype is None:
-            logger.info("Setting dtype to torch.float32 as no dtype was specified in from_pretrained")
-            dtype = torch.float32
-        return dtype
-
-    # TODO: make this into a `ConversionType` ops -> potentially requires all weights on all ranks
-    # depending on the layer type (moe -> no if ep)
-    def create_quantized_param(
-        self,
-        model: "PreTrainedModel",
-        param_value: "torch.Tensor",
-        param_name: str,
-        target_device: "torch.device",
-        **kwargs,
-    ):
-        from ..integrations.finegrained_fp8 import FP8Linear
-        from ..modeling_utils import _load_parameter_into_model
-
-        # Sanity checks
-        module, tensor_name = get_module_from_name(model, param_name)
-        if isinstance(module, FP8Linear):
-            if self.pre_quantized or tensor_name == "bias":
-                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
-                    raise ValueError("Expect quantized weights but got an unquantized weight")
-                else:
-                    return
-            # if tensor_name == "weight_scale_inv":
-            #     raise ValueError("Expect unquantized weights but got a quantized weight_scale")
-
-        param_value = param_value.to(target_device)
-
-        # Get FP8 min/max values
-        fp8_min = torch.finfo(torch.float8_e4m3fn).min
-        fp8_max = torch.finfo(torch.float8_e4m3fn).max
-
-        block_size_m, block_size_n = self.quantization_config.weight_block_size
-
-        rows, cols = param_value.shape[-2:]
-
-        if rows % block_size_m != 0 or cols % block_size_n != 0:
-            raise ValueError(
-                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})"
-            )
-        param_value_orig_shape = param_value.shape
-
-        param_value = param_value.reshape(
-            -1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n
-        ).permute(0, 1, 3, 2, 4)
-
-        # Calculate scaling factor for each block
-        max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
-        scale = fp8_max / max_abs
-        scale_orig_shape = scale.shape
-        scale = scale.unsqueeze(-1).unsqueeze(-1)
-
-        # Quantize the weights
-        quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
-
-        quantized_param = quantized_param.permute(0, 1, 3, 2, 4)
-        # Reshape back to matrix shape
-        quantized_param = quantized_param.reshape(param_value_orig_shape)
-
-        # Reshape scale to match the number of blocks
-        scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()
-
-        # Load into the model
-        _load_parameter_into_model(model, param_name, quantized_param)
-        _load_parameter_into_model(model, param_name.rsplit(".", 1)[0] + ".weight_scale_inv", scale)
-
     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
         from ..integrations.finegrained_fp8 import FP8Expert, FP8Linear
 
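For reference, the removed create_quantized_param above was a self-contained implementation of block-wise e4m3fn quantization: one scale per (block_size_m, block_size_n) tile, stored as its reciprocal under the `weight_scale_inv` suffix. A standalone sketch of the same scheme, using only plain torch and an illustrative function name (the real quantizer reads the block size from quantization_config.weight_block_size rather than a fixed default):

    import torch

    def quantize_fp8_blockwise(weight: torch.Tensor, block_m: int = 128, block_n: int = 128):
        # Illustrative restatement of the removed logic, not a transformers API.
        fp8_min = torch.finfo(torch.float8_e4m3fn).min
        fp8_max = torch.finfo(torch.float8_e4m3fn).max
        rows, cols = weight.shape
        if rows % block_m or cols % block_n:
            raise ValueError(f"({rows}, {cols}) must be divisible by ({block_m}, {block_n})")
        # View the matrix as a grid of (block_m, block_n) tiles
        tiles = weight.reshape(rows // block_m, block_m, cols // block_n, block_n).permute(0, 2, 1, 3)
        # One scale per tile so the largest magnitude maps onto the FP8 maximum
        scale = fp8_max / tiles.abs().amax(dim=(-1, -2), keepdim=True)
        qweight = torch.clamp(tiles * scale, fp8_min, fp8_max).to(torch.float8_e4m3fn)
        qweight = qweight.permute(0, 2, 1, 3).reshape(rows, cols)
        # Return the inverse scale, matching the `weight_scale_inv` naming above
        return qweight, scale.squeeze(-1).squeeze(-1).reciprocal()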
@@ -165,35 +90,17 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
     ):
         from ..integrations.finegrained_fp8 import replace_with_fp8_linear
 
-        # takes 2 fucking seconds
         self.modules_to_not_convert = self.get_modules_to_not_convert(
             model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
         )
 
-        # while this one is 81ms :)
         model = replace_with_fp8_linear(
             model,
             modules_to_not_convert=self.modules_to_not_convert,
             quantization_config=self.quantization_config,
+            pre_quantized=self.pre_quantized,
         )
 
-        model.config.quantization_config = self.quantization_config
-
-    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
-        from ..integrations import FP8Linear
-
-        not_missing_keys = []
-        for name, module in model.named_modules():
-            if isinstance(module, FP8Linear):
-                for missing in missing_keys:
-                    if (
-                        (name in missing or name in f"{prefix}.{missing}")
-                        and not missing.endswith(".weight")
-                        and not missing.endswith(".bias")
-                    ):
-                        not_missing_keys.append(missing)
-        return [k for k in missing_keys if k not in not_missing_keys]
-
     # NOTE: TP is applied before quantization so this is only to add hooks.
     # Quantization is incompatible with DTensors, so we have to anyway have
     # gathers! But it should be model independant -> figure out where to put
@@ -223,7 +130,7 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
 
         return config
 
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self):
         return True
 
     @property
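The is_serializable signature change (dropping the unused safe_serialization argument) recurs in the FPQuant, GPTQ, and HIGGS quantizers further down; an out-of-tree quantizer would adjust the same way. A minimal, hypothetical fragment against the rc1 signature (other required hooks omitted):

    from transformers.quantizers.base import HfQuantizer

    class MyCustomQuantizer(HfQuantizer):  # hypothetical subclass
        def is_serializable(self):  # rc0 signature was is_serializable(self, safe_serialization=None)
            return True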
@@ -246,8 +153,9 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
         if self.pre_quantized and self.quantization_config.dequantize:
             return [
                 # either use the dollar sign, or permute the source patterns to start matching against the scales first
+                # We also collect the activation scales, they will not be used
                 WeightConverter(
-                    source_patterns=["weight$", "weight_scale_inv"],
+                    source_patterns=["weight$", "weight_scale_inv", "activation_scale"],
                     target_patterns="weight",
                     operations=[Fp8Dequantize(self)],
                 )

transformers/quantizers/quantizer_fp_quant.py

@@ -36,13 +36,10 @@ class FPQuantHfQuantizer(HfQuantizer):
     """
 
     requires_calibration = False
-    requires_parameters_quantization = True
     is_qat_trainable = True
-    required_packages = ["fp_quant"]
 
     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.quantization_config = quantization_config
 
     def validate_environment(self, device_map, **kwargs):
         if not torch.cuda.is_available() and not is_torch_xpu_available():
@@ -68,15 +65,17 @@ class FPQuantHfQuantizer(HfQuantizer):
                 "You are attempting to load a FPQuant model without setting device_map."
                 " Please set device_map comprised of 'cuda' devices."
             )
-        elif (
-            isinstance(device_map, dict)
-            and ("cpu" in device_map.values() or "disk" in device_map.values())
-            and not self.quantization_config.pseudoquantization
-        ):
-            raise ValueError(
-                "You are attempting to load a FPQuant model with a device_map that contains a CPU or disk device."
-                " This is not supported. Please remove the CPU or disk device from the device_map."
-            )
+        elif isinstance(device_map, dict):
+            if (
+                not self.quantization_config.pseudoquantization
+                and len(device_map) > 1
+                and "cpu" in device_map.values()
+                or "disk" in device_map.values()
+            ):
+                raise ValueError(
+                    "You are attempting to load a FPQuant model with a device_map that contains a CPU or disk device."
+                    " This is not supported. Please remove the CPU or disk device from the device_map."
+                )
 
     def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
         if dtype is None:
@@ -84,50 +83,17 @@ class FPQuantHfQuantizer(HfQuantizer):
             dtype = torch.bfloat16
         elif dtype != torch.bfloat16:
             raise ValueError(f"Invalid `dtype` {dtype}. fp_quant quantization only supports `dtype=torch.bfloat16`.")
-
         return dtype
 
-    def create_quantized_param(
-        self,
-        model: "PreTrainedModel",
-        param_value: "torch.Tensor",
-        param_name: str,
-        target_device: "torch.device",
-        **kwargs,
-    ):
-        module, _ = get_module_from_name(model, param_name)
-
-        if target_device == "cpu" and param_name.endswith("weight"):
-            # Works agains hard-coded missing key dispatch to CPU
-            return
-
-        # The module holds either:
-        # * `weight` when `store_master_weights=True`
-        # * `qweight` and `scales` when `store_master_weights=False` and `pseudoquantization=False`
-        # * `dqweight` when `store_master_weights=False` and `pseudoquantization=True`
-
-        if param_name.endswith(".qweight"):
-            # Loading a real quantized checkpoint without master weights
-            module.qweight = torch.nn.Parameter(
-                param_value.to(target_device),
-                requires_grad=False,
-            )
-            module.weight = None
-            module.dqweight = None
-            return
-
-        if param_name.endswith(".dqweight"):
-            # Loading a pseudo-quantized checkpoint without master weights
-            module.dqweight = torch.nn.Parameter(param_value.to(target_device))
-            module.weight = None
-            module.qweight = None
-            module.scales = None
-            return
-
-        # Loading master weights or an unquantized checkpoint
-        module.weight = torch.nn.Parameter(param_value.to(target_device))
-        # Let pre-forward handle the quantization and set None where necessary
-        module.pre_forward()
+    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
+        from fp_quant import FPQuantLinear
+
+        module, tensor_name = get_module_from_name(model, param_name)
+        if isinstance(module, FPQuantLinear) and tensor_name in ["weight", "qweight", "dqweight"]:
+            # Only quantize weights of FPQuantLinear modules that are not already quantized
+            return True
+        else:
+            return False
 
     def _process_model_before_weight_loading(
         self,
@@ -142,20 +108,6 @@ class FPQuantHfQuantizer(HfQuantizer):
             model,
             fp_quant_linear_config=adapt_fp_quant_config(self.quantization_config),
         )
-        model.config.quantization_config = self.quantization_config
-
-    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
-        from fp_quant import FPQuantLinear
-
-        fp_quant_names = {name for name, module in model.named_modules() if isinstance(module, FPQuantLinear)}
-
-        def should_exclude(key: str) -> bool:
-            if key.endswith(".weight") or key.endswith(".bias"):
-                return False
-            full_key = f"{prefix}.{key}"
-            return any(name in key or name in full_key for name in fp_quant_names)
-
-        return [key for key in missing_keys if not should_exclude(key)]
 
     @property
     def is_trainable(self, model: Optional["PreTrainedModel"] = None):
@@ -166,15 +118,33 @@ class FPQuantHfQuantizer(HfQuantizer):
         )
         return trainable
 
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self):
         return True
 
-    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
-        from fp_quant import FPQuantLinear
-
-        module, tensor_name = get_module_from_name(model, param_name)
-        if isinstance(module, FPQuantLinear) and tensor_name in ["weight", "qweight", "dqweight"]:
-            # Only quantize weights of FPQuantLinear modules that are not already quantized
-            return True
-        else:
-            return False
+    def get_quantize_ops(self):
+        from ..integrations.fp_quant import FpQuantQuantize
+
+        return FpQuantQuantize(self)
+
+    def get_weight_conversions(self):
+        from ..core_model_loading import WeightConverter
+        from ..integrations.fp_quant import FpQuantDeserialize
+
+        if self.pre_quantized:
+            if self.quantization_config.pseudoquantization:
+                return [
+                    WeightConverter(
+                        source_patterns=[".dqweight"],
+                        target_patterns=".dqweight",
+                        operations=[FpQuantDeserialize(self)],
+                    ),
+                ]
+            else:
+                return [
+                    WeightConverter(
+                        source_patterns=[".qweight"],
+                        target_patterns=".qweight",
+                        operations=[FpQuantDeserialize(self)],
+                    ),
+                ]
+        return []
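Both this quantizer's new get_weight_conversions and the FP8 converter list shown earlier describe checkpoint handling declaratively: each WeightConverter maps tensors matching source_patterns onto a target parameter after running the listed operations, replacing the imperative create_quantized_param hooks deleted above. As a rough mental model only (this is not the real WeightConverter implementation, whose operations receive additional loading context), the idea is:

    import re
    from typing import Callable

    def apply_converter(state_dict: dict, source_patterns: list[str],
                        target_pattern: str, operation: Callable[[list], object]) -> dict:
        # Group every tensor whose key matches a source pattern under the rewritten
        # target key, then let the operation combine each group into one tensor.
        out, groups = dict(state_dict), {}
        for key in list(out):
            for pattern in source_patterns:
                if re.search(pattern, key):
                    groups.setdefault(re.sub(pattern, target_pattern, key), []).append(out.pop(key))
                    break
        for target_key, tensors in groups.items():
            out[target_key] = operation(tensors)
        return out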

transformers/quantizers/quantizer_gptq.py

@@ -22,7 +22,7 @@ from .base import HfQuantizer
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel
 
-from ..utils import is_auto_gptq_available, is_gptqmodel_available, is_optimum_available, is_torch_available, logging
+from ..utils import is_gptqmodel_available, is_optimum_available, is_torch_available, logging
 from ..utils.quantization_config import GPTQConfig, QuantizationConfigMixin
 
 
@@ -35,12 +35,11 @@ logger = logging.get_logger(__name__)
 class GptqHfQuantizer(HfQuantizer):
     """
     Quantizer of the GPTQ method - for GPTQ the quantizer support calibration of the model through
-    `auto_gptq` or `gptqmodel` package. Quantization is done under the hood for users if they load a non-prequantized model.
+    the GPT-QModel package (Python import name `gptqmodel`). Quantization is done under the hood for users if they
+    load a non-prequantized model.
     """
 
     requires_calibration = False
-    required_packages = ["optimum", "auto_gptq", "gptqmodel"]
-    optimum_quantizer = None
 
     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
@@ -54,25 +53,12 @@ class GptqHfQuantizer(HfQuantizer):
     def validate_environment(self, *args, **kwargs):
         if not is_optimum_available():
             raise ImportError("Loading a GPTQ quantized model requires optimum (`pip install optimum`)")
-        if is_auto_gptq_available() and is_gptqmodel_available():
-            logger.warning("Detected gptqmodel and auto-gptq, will use gptqmodel")
 
-        gptq_supports_cpu = (
-            is_auto_gptq_available()
-            and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2")
-        ) or is_gptqmodel_available()
+        gptq_supports_cpu = is_gptqmodel_available()
         if not gptq_supports_cpu and not torch.cuda.is_available():
             raise RuntimeError("GPU is required to quantize or run quantize model.")
-        elif not (is_auto_gptq_available() or is_gptqmodel_available()):
-            raise ImportError(
-                "Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) library. "
-            )
-        elif is_auto_gptq_available() and version.parse(importlib.metadata.version("auto_gptq")) < version.parse(
-            "0.4.2"
-        ):
-            raise ImportError(
-                "You need a version of auto_gptq >= 0.4.2 to use GPTQ: `pip install --upgrade auto-gptq` or use gptqmodel by `pip install gptqmodel>=1.4.3`."
-            )
+        elif not is_gptqmodel_available():
+            raise ImportError("Loading a GPTQ quantized model requires gptqmodel (`pip install gptqmodel`) library.")
         elif is_gptqmodel_available() and (
             version.parse(importlib.metadata.version("gptqmodel")) < version.parse("1.4.3")
             or version.parse(importlib.metadata.version("optimum")) < version.parse("1.23.99")
@@ -90,9 +76,6 @@ class GptqHfQuantizer(HfQuantizer):
     def update_device_map(self, device_map):
         if device_map is None:
             device_map = {"": torch.device("cpu")}
-        # Only with auto-gptq do not support CPU, we should move the model to cuda if available.
-        if not is_gptqmodel_available() and device_map in ("cpu", {"": torch.device("cpu")}):
-            device_map = {"": 0}
         return device_map
 
     def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
@@ -120,5 +103,5 @@ class GptqHfQuantizer(HfQuantizer):
     def is_trainable(self) -> bool:
         return True
 
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self):
         return True
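With auto-gptq support removed, loading a pre-quantized GPTQ checkpoint in rc1 requires optimum (>= 1.23.99) and gptqmodel (>= 1.4.3), per the validate_environment checks above. A minimal sketch with a placeholder model id:

    # pip install optimum gptqmodel   (auto-gptq is no longer accepted)
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "your-org/some-gptq-quantized-model"  # placeholder
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")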

transformers/quantizers/quantizer_higgs.py

@@ -37,12 +37,9 @@ class HiggsHfQuantizer(HfQuantizer):
     """
 
     requires_calibration = False
-    requires_parameters_quantization = True
-    required_packages = ["flute-kernel", "fast_hadamard_transform"]
 
     def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
         super().__init__(quantization_config, **kwargs)
-        self.quantization_config = quantization_config
 
     def validate_environment(self, device_map, **kwargs):
         if not torch.cuda.is_available():
@@ -64,11 +61,12 @@ class HiggsHfQuantizer(HfQuantizer):
                 "You are attempting to load a HIGGS model without setting device_map."
                 " Please set device_map comprised of 'cuda' devices."
             )
-        elif isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
-            raise ValueError(
-                "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device."
-                " This is not supported. Please remove the CPU or disk device from the device_map."
-            )
+        elif isinstance(device_map, dict):
+            if "cpu" in device_map.values() or "disk" in device_map.values():
+                raise ValueError(
+                    "You are attempting to load a HIGGS model with a device_map that contains a CPU or disk device."
+                    " This is not supported. Please remove the CPU or disk device from the device_map."
+                )
 
     def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
         if dtype is None:
@@ -81,37 +79,39 @@ class HiggsHfQuantizer(HfQuantizer):
 
         return dtype
 
-    def create_quantized_param(
-        self,
-        model: "PreTrainedModel",
-        param_value: "torch.Tensor",
-        param_name: str,
-        target_device: "torch.device",
-        **kwargs,
-    ):
-        from ..integrations import quantize_with_higgs
-
-        flute_dict = quantize_with_higgs(
-            param_value.to(target_device),
-            self.quantization_config.bits,
-            self.quantization_config.p,
-            self.quantization_config.group_size,
-            self.quantization_config.hadamard_size,
-        )
-        del param_value
-
-        module, _ = get_module_from_name(model, param_name)
-        module_name = ".".join(param_name.split(".")[:-1])
-        for key, value in flute_dict.items():
-            if key in module._parameters:
-                module._parameters[key] = torch.nn.Parameter(value, requires_grad=False)
-            elif key in module._buffers:
-                module._buffers[key] = torch.nn.Buffer(value)
-            elif key == "tune_metadata":
-                module.tune_metadata = value
-                self.quantization_config.tune_metadata[module_name] = value.to_dict()
-            else:
-                raise ValueError(f"Unexpected key {key} in module {module}")
+    # TODO: to remove
+    # Kept here in case we see some interest in adding support for it
+    # def create_quantized_param(
+    #     self,
+    #     model: "PreTrainedModel",
+    #     param_value: "torch.Tensor",
+    #     param_name: str,
+    #     target_device: "torch.device",
+    #     **kwargs,
+    # ):
+    #     from ..integrations import quantize_with_higgs
+
+    #     flute_dict = quantize_with_higgs(
+    #         param_value.to(target_device),
+    #         self.quantization_config.bits,
+    #         self.quantization_config.p,
+    #         self.quantization_config.group_size,
+    #         self.quantization_config.hadamard_size,
+    #     )
+    #     del param_value
+
+    #     module, _ = get_module_from_name(model, param_name)
+    #     module_name = ".".join(param_name.split(".")[:-1])
+    #     for key, value in flute_dict.items():
+    #         if key in module._parameters:
+    #             module._parameters[key] = torch.nn.Parameter(value, requires_grad=False)
+    #         elif key in module._buffers:
+    #             module._buffers[key] = torch.nn.Buffer(value)
+    #         elif key == "tune_metadata":
+    #             module.tune_metadata = value
+    #             self.quantization_config.tune_metadata[module_name] = value.to_dict()
+    #         else:
+    #             raise ValueError(f"Unexpected key {key} in module {module}")
 
     def _process_model_before_weight_loading(
         self,
@@ -130,7 +130,6 @@ class HiggsHfQuantizer(HfQuantizer):
             quantization_config=self.quantization_config,
             modules_to_not_convert=self.modules_to_not_convert,
         )
-        model.config.quantization_config = self.quantization_config
 
     def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
         from flute.tune import TuneMetaData, maybe_tune_and_repack
@@ -157,24 +156,11 @@ class HiggsHfQuantizer(HfQuantizer):
             )
             self.quantization_config.tune_metadata[name] = module.tune_metadata.to_dict()
 
-    def update_missing_keys(self, model, missing_keys: list[str], prefix: str) -> list[str]:
-        from ..integrations import HiggsLinear
-
-        higgs_names = {name for name, module in model.named_modules() if isinstance(module, HiggsLinear)}
-
-        def should_update(key: str) -> bool:
-            if key.endswith(".weight") or key.endswith(".bias"):
-                return False
-            full_key = f"{prefix}.{key}"
-            return any(name in key or name in full_key for name in higgs_names)
-
-        return [key for key in missing_keys if not should_update(key)]
-
     @property
     def is_trainable(self) -> bool:
         return False
 
-    def is_serializable(self, safe_serialization=None):
+    def is_serializable(self):
         return True
 
     def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool: