transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
The hunks below are from transformers/utils/quantization_config.py (entry 529 in the list above) and track the removal of the auto-gptq and autoawq backends:

@@ -28,16 +28,13 @@ from typing import Any, Optional, Union
 from packaging import version
 
 from ..utils import (
-    is_auto_awq_available,
     is_compressed_tensors_available,
-    is_gptqmodel_available,
     is_hqq_available,
     is_quark_available,
     is_torch_available,
     is_torchao_available,
     logging,
 )
-from .import_utils import is_auto_gptq_available
 
 
 if is_torch_available():
@@ -68,30 +65,26 @@ class QuantizationMethod(str, Enum):
     MXFP4 = "mxfp4"
 
 
-class AWQLinearVersion(str, Enum):
+class AwqFormat(str, Enum):
     GEMM = "gemm"
     GEMV = "gemv"
-    EXLLAMA = "exllama"
-    IPEX = "ipex"
+    GEMV_FAST = "gemv_fast"
 
-    @staticmethod
-    def from_str(version: str):
-        version = version.lower()
-        if version == "gemm":
-            return AWQLinearVersion.GEMM
-        elif version == "gemv":
-            return AWQLinearVersion.GEMV
-        elif version == "exllama":
-            return AWQLinearVersion.EXLLAMA
-        elif version == "ipex":
-            return AWQLinearVersion.IPEX
-        else:
-            raise ValueError(f"Unknown AWQLinearVersion {version}")
 
-
-class AwqBackendPackingMethod(str, Enum):
-    AUTOAWQ = "autoawq"
-    LLMAWQ = "llm-awq"
+class AwqBackend(str, Enum):
+    LEGACY_AWQ = "autoawq"
+    AUTO = "auto"
+    AUTO_TRAINABLE = "auto_trainable"
+    MACHETE = "machete"
+    MARLIN = "marlin"
+    EXLLAMA_V2 = "exllama_v2"
+    EXLLAMA_V1 = "exllama_v1"
+    GEMM = "gemm"
+    GEMM_TRITON = "gemm_triton"
+    GEMV = "gemv"
+    GEMV_FAST = "gemv_fast"
+    TORCH_AWQ = "torch_awq"
+    TORCH_FUSED_AWQ = "torch_fused_awq"
 
 
 @dataclass
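Since both replacement enums subclass str, members compare equal to (and can be constructed from) their raw string values, which is what makes the removed AWQLinearVersion.from_str helper unnecessary. A minimal standalone sketch of that behavior (stdlib only; the AwqFormat body is copied from the hunk above):

    from enum import Enum

    class AwqFormat(str, Enum):
        GEMM = "gemm"
        GEMV = "gemv"
        GEMV_FAST = "gemv_fast"

    # A str-subclassing Enum compares equal to the plain string...
    assert AwqFormat.GEMM == "gemm"
    # ...and the Enum constructor looks members up by value, which is
    # exactly what the hand-written from_str() used to do:
    assert AwqFormat("gemv_fast") is AwqFormat.GEMV_FAST
    # Case normalization stays with the caller (the new AwqConfig
    # lower-cases legacy `version` strings before storing them):
    assert AwqFormat("GEMV".lower()) is AwqFormat.GEMV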
@@ -620,7 +613,7 @@ class ExllamaVersion(int, Enum):
 class GPTQConfig(QuantizationConfigMixin):
     """
     This is a wrapper class about all possible attributes and features that you can play with a model that has been
-    loaded using `optimum` api for gptq quantization relying on auto_gptq backend.
+    loaded using `optimum` api for GPTQ quantization relying on the gptqmodel backend.
 
     Args:
         bits (`int`):
@@ -641,22 +634,23 @@ class GPTQConfig(QuantizationConfigMixin):
         desc_act (`bool`, *optional*, defaults to `False`):
             Whether to quantize columns in order of decreasing activation size. Setting it to False can significantly
             speed up inference but the perplexity may become slightly worse. Also known as act-order.
+        act_group_aware (`bool`, *optional*, defaults to `True`):
+            Use GAR (group aware activation order) during quantization. Has measurable positive impact on quantization
+            quality. Only applicable when `desc_act = False`. Will forced to be `False` when `desc_act = True`.
         sym (`bool`, *optional*, defaults to `True`):
             Whether to use symmetric quantization.
         true_sequential (`bool`, *optional*, defaults to `True`):
             Whether to perform sequential quantization even within a single Transformer block. Instead of quantizing
             the entire block at once, we perform layer-wise quantization. As a result, each layer undergoes
             quantization using inputs that have passed through the previously quantized layers.
-        checkpoint_format (`str`, *optional*, defaults to `"gptq"`):
-            GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only.
+        format (`str`, *optional*, defaults to `"gptq"`):
+            GPTQ weight format. `gptq` (v1) is supported by gptqmodel. `gptq_v2` is gptqmodel only.
         meta (`dict[str, any]`, *optional*):
             Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta.
             i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"]
         backend (`str`, *optional*):
-            Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only
-            valid value is None and `auto_trainable`. Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
-        use_cuda_fp16 (`bool`, *optional*, defaults to `False`):
-            Whether or not to use optimized cuda kernel for fp16 model. Need to have model in fp16. Auto-gptq only.
+            Controls which kernel to use. Valid values for gptqmodel are `auto`, `auto_trainable` and more. Ref gptqmodel backends:
+            https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py
         model_seqlen (`int`, *optional*):
             The maximum sequence length that the model can take.
         block_name_to_quantize (`str`, *optional*):
@@ -667,14 +661,9 @@ class GPTQConfig(QuantizationConfigMixin):
             The batch size used when processing the dataset
         pad_token_id (`int`, *optional*):
             The pad token id. Needed to prepare the dataset when `batch_size` > 1.
-        use_exllama (`bool`, *optional*):
-            Whether to use exllama backend. Defaults to `True` if unset. Only works with `bits` = 4.
         max_input_length (`int`, *optional*):
             The maximum input length. This is needed to initialize a buffer that depends on the maximum expected input
             length. It is specific to the exllama backend with act-order.
-        exllama_config (`dict[str, Any]`, *optional*):
-            The exllama config. You can specify the version of the exllama kernel through the `version` key. Defaults
-            to `{"version": 1}` if unset.
         cache_block_outputs (`bool`, *optional*, defaults to `True`):
             Whether to cache block outputs to reuse as inputs for the succeeding block.
         modules_in_block_to_quantize (`list[list[str]]`, *optional*):
@@ -694,20 +683,18 @@ class GPTQConfig(QuantizationConfigMixin):
         group_size: int = 128,
         damp_percent: float = 0.1,
         desc_act: bool = False,
+        act_group_aware: bool = True,
         sym: bool = True,
         true_sequential: bool = True,
-        checkpoint_format: str = "gptq",
-        meta: dict[str, Any] | None = None,
-        backend: str | None = None,
-        use_cuda_fp16: bool = False,
-        model_seqlen: int | None = None,
-        block_name_to_quantize: str | None = None,
-        module_name_preceding_first_block: list[str] | None = None,
+        format: str = "gptq",
+        meta: Optional[dict[str, Any]] = None,
+        backend: Optional[str] = None,
+        model_seqlen: Optional[int] = None,
+        block_name_to_quantize: Optional[str] = None,
+        module_name_preceding_first_block: Optional[list[str]] = None,
         batch_size: int = 1,
-        pad_token_id: int | None = None,
-        use_exllama: bool | None = None,
-        max_input_length: int | None = None,
-        exllama_config: dict[str, Any] | None = None,
+        pad_token_id: Optional[int] = None,
+        max_input_length: Optional[int] = None,
         cache_block_outputs: bool = True,
         modules_in_block_to_quantize: list[list[str]] | None = None,
         **kwargs,
@@ -719,33 +706,28 @@ class GPTQConfig(QuantizationConfigMixin):
         self.group_size = group_size
         self.damp_percent = damp_percent
         self.desc_act = desc_act
+        self.act_group_aware = act_group_aware
         self.sym = sym
         self.true_sequential = true_sequential
-        self.checkpoint_format = checkpoint_format.lower()
+        self.format = format.lower()
+        # Compatible with legacy field: checkpoint_format
+        if kwargs.get("checkpoint_format") is not None:
+            self.format = kwargs.pop("checkpoint_format").lower()
         self.meta = meta
         self.backend = backend.lower() if isinstance(backend, str) else backend
-        self.use_cuda_fp16 = use_cuda_fp16
         self.model_seqlen = model_seqlen
         self.block_name_to_quantize = block_name_to_quantize
         self.module_name_preceding_first_block = module_name_preceding_first_block
         self.batch_size = batch_size
         self.pad_token_id = pad_token_id
-        self.use_exllama = use_exllama
         self.max_input_length = max_input_length
-        self.exllama_config = exllama_config
         self.cache_block_outputs = cache_block_outputs
         self.modules_in_block_to_quantize = modules_in_block_to_quantize
         self.post_init()
 
     def get_loading_attributes(self):
         attributes_dict = copy.deepcopy(self.__dict__)
-        loading_attributes = [
-            "use_exllama",
-            "exllama_config",
-            "use_cuda_fp16",
-            "max_input_length",
-            "backend",
-        ]
+        loading_attributes = ["max_input_length", "backend"]
         loading_attributes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes}
         return loading_attributes_dict
 
@@ -772,46 +754,14 @@ class GPTQConfig(QuantizationConfigMixin):
                 ['wikitext2','c4','c4-new'], but we found {self.dataset}"""
             )
 
-        # make sure backend is back/forward compatible with both gptqmodel (full) and auto-gptq (partial)
-        if is_gptqmodel_available():
-            # convert auto-gptq control into gptqmodel backend
-            if self.backend is None:
-                self.backend = "auto_trainable" if self.use_exllama is not None and not self.use_exllama else "auto"
-        else:
-            # convert gptqmodel backend `auto_trainable` into auto-gptq control
-            if self.backend == "auto_trainable":
-                self.use_exllama = False
-
-        # auto-gptq specific kernel control logic
-        if self.use_exllama is None:
-            # New default behaviour
-            self.use_exllama = True
+        # act_group_order is only applicable when `desc_act = False`
+        if self.desc_act and self.act_group_aware:
+            self.act_group_aware = False
+            logger.warning("`act_group_aware` has been auto-disabled as it is not compatible with `desc_act = True`.")
 
-        if self.exllama_config is None:
-            self.exllama_config = {"version": ExllamaVersion.ONE}
-        else:
-            if "version" not in self.exllama_config:
-                raise ValueError("`exllama_config` needs to have a `version` key.")
-            elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
-                exllama_version = self.exllama_config["version"]
-                raise ValueError(
-                    f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}"
-                )
-
-        if self.bits == 4 and self.use_exllama:
-            if self.exllama_config["version"] == ExllamaVersion.ONE:
-                logger.info(
-                    "You have activated exllama backend. Note that you can get better inference "
-                    "speed using exllamav2 kernel by setting `exllama_config`."
-                )
-            elif self.exllama_config["version"] == ExllamaVersion.TWO:
-                if is_auto_gptq_available():
-                    optimum_version = version.parse(importlib.metadata.version("optimum"))
-                    autogptq_version = version.parse(importlib.metadata.version("auto_gptq"))
-                    if optimum_version <= version.parse("1.13.2") or autogptq_version <= version.parse("0.4.2"):
-                        raise ValueError(
-                            f"You need optimum > 1.13.2 and auto-gptq > 0.4.2 . Make sure to have that version installed - detected version : optimum {optimum_version} and autogptq {autogptq_version}"
-                        )
+        # make sure backend default stays consistent with gptqmodel expectations
+        if self.backend is None:
+            self.backend = "auto"
         if self.modules_in_block_to_quantize is not None:
             optimum_version = version.parse(importlib.metadata.version("optimum"))
             if optimum_version < version.parse("1.15.0"):
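The new post_init logic makes desc_act and the new act_group_aware flag mutually exclusive, with desc_act taking priority: GAR is switched off with a warning rather than raising. A hedged sketch of the observable behavior, assuming GPTQConfig is imported from transformers:

    from transformers import GPTQConfig

    # act_group_aware defaults to True, but desc_act=True wins:
    cfg = GPTQConfig(bits=4, desc_act=True)
    assert cfg.act_group_aware is False  # auto-disabled, warning logged

    # With desc_act left at False, GAR stays enabled:
    cfg = GPTQConfig(bits=4)
    assert cfg.act_group_aware is True

    # The backend also now defaults to "auto" when unset:
    assert cfg.backend == "auto"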
@@ -821,17 +771,15 @@ class GPTQConfig(QuantizationConfigMixin):
 
     def to_dict(self) -> dict[str, Any]:
         config_dict = super().to_dict()
-        config_dict.pop("disable_exllama", None)
+        # Compatible with legacy field: checkpoint_format
+        config_dict["checkpoint_format"] = self.format
         return config_dict
 
     def to_dict_optimum(self):
         """
         Get compatible dict for optimum gptq config
         """
-        quant_dict = self.to_dict()
-        # make it compatible with optimum config
-        quant_dict["disable_exllama"] = not self.use_exllama
-        return quant_dict
+        return self.to_dict()
 
     @classmethod
     def from_dict_optimum(cls, config_dict):
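Between the __init__ kwargs handling shown earlier and this to_dict override, the legacy checkpoint_format key keeps working in both directions: it is accepted on input and mirrored on output. A hedged sketch:

    from transformers import GPTQConfig

    # Old-style construction still works: `checkpoint_format` is popped
    # from **kwargs and overrides the new `format` field.
    legacy = GPTQConfig(bits=4, checkpoint_format="gptq_v2")
    assert legacy.format == "gptq_v2"

    # Serialization mirrors `format` back under the legacy key, so
    # tooling that still reads `checkpoint_format` is unaffected:
    d = GPTQConfig(bits=4, format="gptq_v2").to_dict()
    assert d["checkpoint_format"] == "gptq_v2"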
@@ -839,17 +787,12 @@ class GPTQConfig(QuantizationConfigMixin):
         Get compatible class with optimum gptq config dict
         """
 
-        if "disable_exllama" in config_dict:
-            config_dict["use_exllama"] = not config_dict["disable_exllama"]
-            # switch to None to not trigger the warning
-            config_dict.pop("disable_exllama")
-
         config = cls(**config_dict)
         return config
 
 
 @dataclass
-class AwqConfig(QuantizationConfigMixin):
+class AwqConfig(GPTQConfig):
     """
     This is a wrapper class about all possible attributes and features that you can play with a model that has been
     loaded using `auto-awq` library awq quantization relying on auto_awq backend.
@@ -861,26 +804,12 @@ class AwqConfig(QuantizationConfigMixin):
             The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization.
         zero_point (`bool`, *optional*, defaults to `True`):
             Whether to use zero point quantization.
-        version (`AWQLinearVersion`, *optional*, defaults to `AWQLinearVersion.GEMM`):
-            The version of the quantization algorithm to use. GEMM is better for big batch_size (e.g. >= 8) otherwise,
-            GEMV is better (e.g. < 8 ). GEMM models are compatible with Exllama kernels.
-        backend (`AwqBackendPackingMethod`, *optional*, defaults to `AwqBackendPackingMethod.AUTOAWQ`):
-            The quantization backend. Some models might be quantized using `llm-awq` backend. This is useful for users
-            that quantize their own models using `llm-awq` library.
-        do_fuse (`bool`, *optional*, defaults to `False`):
-            Whether to fuse attention and mlp layers together for faster inference
-        fuse_max_seq_len (`int`, *optional*):
-            The Maximum sequence length to generate when using fusing.
-        modules_to_fuse (`dict`, *optional*, default to `None`):
-            Overwrite the natively supported fusing scheme with the one specified by the users.
+        backend (`AwqBackend`, *optional*, defaults to `AwqBackend.AUTO`):
+            The quantization backend.
         modules_to_not_convert (`list`, *optional*, default to `None`):
             The list of modules to not quantize, useful for quantizing models that explicitly require to have
             some modules left in their original precision (e.g. Whisper encoder, Llava encoder, Mixtral gate layers).
             Note you cannot quantize directly with transformers, please refer to `AutoAWQ` documentation for quantizing HF models.
-        exllama_config (`dict[str, Any]`, *optional*):
-            You can specify the version of the exllama kernel through the `version` key, the maximum sequence
-            length through the `max_input_len` key, and the maximum batch size through the `max_batch_size` key.
-            Defaults to `{"version": 2, "max_input_len": 2048, "max_batch_size": 8}` if unset.
     """
 
     def __init__(
@@ -888,141 +817,45 @@ class AwqConfig(QuantizationConfigMixin):
         bits: int = 4,
         group_size: int = 128,
         zero_point: bool = True,
-        version: AWQLinearVersion = AWQLinearVersion.GEMM,
-        backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ,
-        do_fuse: bool | None = None,
-        fuse_max_seq_len: int | None = None,
-        modules_to_fuse: dict | None = None,
+        backend: AwqBackend = AwqBackend.AUTO,
         modules_to_not_convert: list | None = None,
-        exllama_config: dict[str, int] | None = None,
         **kwargs,
     ):
-        self.quant_method = QuantizationMethod.AWQ
-
-        self.bits = bits
-        self.group_size = group_size
+        format = kwargs.pop("format", AwqFormat.GEMM)
+        # Compatible with legacy field: version
+        if kwargs.get("version") is not None:
+            format = kwargs.pop("version").lower()
+        # Compatible with legacy backend
+        if backend == AwqBackend.LEGACY_AWQ:
+            backend = AwqBackend.AUTO
         self.zero_point = zero_point
-        self.version = version
-        self.backend = backend
-        self.fuse_max_seq_len = fuse_max_seq_len
         self.modules_to_not_convert = modules_to_not_convert
-        self.exllama_config = exllama_config
-
-        self.modules_to_fuse = modules_to_fuse
-        if do_fuse is None:
-            self.do_fuse = modules_to_fuse is not None and len(modules_to_fuse) > 0
-        else:
-            self.do_fuse = do_fuse
-        self.fuse_max_seq_len = fuse_max_seq_len
 
-        self.post_init()
+        super().__init__(bits=bits, group_size=group_size, backend=backend, format=format, **kwargs)
+        self.quant_method = QuantizationMethod.AWQ
 
     def post_init(self):
         r"""
         Safety checker that arguments are correct
         """
-        if self.backend not in [AwqBackendPackingMethod.AUTOAWQ, AwqBackendPackingMethod.LLMAWQ]:
-            raise ValueError(
-                f"Only supported quantization backends in {AwqBackendPackingMethod.AUTOAWQ} and {AwqBackendPackingMethod.LLMAWQ} - not recognized backend {self.backend}"
-            )
-
-        self.version = AWQLinearVersion.from_str(self.version)
-        if self.version not in [
-            AWQLinearVersion.GEMM,
-            AWQLinearVersion.GEMV,
-            AWQLinearVersion.EXLLAMA,
-            AWQLinearVersion.IPEX,
+        if self.format not in [
+            AwqFormat.GEMM,
+            AwqFormat.GEMV,
+            AwqFormat.GEMV_FAST,
         ]:
             raise ValueError(
-                f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.EXLLAMA, AWQLinearVersion.IPEX] - not recognized version {self.version}"
+                f"Only supported versions are in [AWQLinearVersion.GEMM, AWQLinearVersion.GEMV, AWQLinearVersion.GEMV_FAST] - not recognized version {self.format}"
             )
 
-        if self.backend == AwqBackendPackingMethod.LLMAWQ:
-            # Only cuda device can run this function
-            if not (torch.cuda.is_available() or torch.xpu.is_available()):
-                raise ValueError("LLM-AWQ backend is only supported on CUDA and XPU")
-            if torch.cuda.is_available():
-                compute_capability = torch.cuda.get_device_capability()
-                major, minor = compute_capability
-                if major < 8:
-                    raise ValueError("LLM-AWQ backend is only supported on CUDA GPUs with compute capability >= 8.0")
-
-        if self.do_fuse and self.fuse_max_seq_len is None:
-            raise ValueError(
-                "You cannot enable fused modules without specifying a `fuse_max_seq_len`, make sure to pass a valid `fuse_max_seq_len` for your usecase"
-            )
+        if self.backend not in AwqBackend.__members__.values():
+            raise ValueError(f"Invalid backend '{self.backend}'. Must be one of: {[b.value for b in AwqBackend]}")
 
-        if self.do_fuse:
-            awq_version_supports_fusing = False
-            MIN_AWQ_VERSION = "0.1.7"
-            if is_auto_awq_available():
-                awq_version_supports_fusing = version.parse(importlib.metadata.version("autoawq")) >= version.parse(
-                    MIN_AWQ_VERSION
-                )
-
-            if not awq_version_supports_fusing:
-                raise ValueError(
-                    f"You current version of `autoawq` does not support module fusing, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
-                )
-
-        if self.modules_to_not_convert is not None:
-            awq_version_supports_non_conversion = False
-            MIN_AWQ_VERSION = "0.1.8"
-            if is_auto_awq_available():
-                awq_version_supports_non_conversion = version.parse(
-                    importlib.metadata.version("autoawq")
-                ) >= version.parse(MIN_AWQ_VERSION)
-
-            if not awq_version_supports_non_conversion:
-                raise ValueError(
-                    f"You current version of `autoawq` does not support module quantization skipping, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
-                )
-
-        if self.do_fuse and self.modules_to_fuse is not None:
-            required_keys = [
-                "hidden_size",
-                "num_attention_heads",
-                "num_key_value_heads",
-                "mlp",
-                "attention",
-                "layernorm",
-                "use_alibi",
-            ]
-            if not all(key in self.modules_to_fuse for key in required_keys):
-                raise ValueError(
-                    f"Required fields are missing in the fusing mapping, required fields are {required_keys}"
-                )
-
-        if self.version == AWQLinearVersion.EXLLAMA:
-            awq_version_supports_exllama = False
-            MIN_AWQ_VERSION = "0.2.0"
-            if is_auto_awq_available():
-                awq_version_supports_exllama = version.parse(importlib.metadata.version("autoawq")) >= version.parse(
-                    MIN_AWQ_VERSION
-                )
-
-            if not awq_version_supports_exllama:
-                raise ValueError(
-                    f"You current version of `autoawq` does not support exllama backend, "
-                    f"please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
-                )
-
-        if self.exllama_config is None:
-            self.exllama_config = {"version": ExllamaVersion.TWO, "max_input_len": 2048, "max_batch_size": 8}
-        else:
-            if "version" not in self.exllama_config:
-                raise ValueError("`exllama_config` needs to have a `version` key.")
-            elif self.exllama_config["version"] not in [ExllamaVersion.ONE, ExllamaVersion.TWO]:
-                exllama_version = self.exllama_config["version"]
-                raise ValueError(
-                    f"Only supported versions are in [ExllamaVersion.ONE, ExllamaVersion.TWO] - not recognized version {exllama_version}"
-                )
-
-    def get_loading_attributes(self):
-        attributes_dict = copy.deepcopy(self.__dict__)
-        loading_attributes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len", "exllama_config"]
-        loading_attributes_dict = {i: j for i, j in attributes_dict.items() if i in loading_attributes}
-        return loading_attributes_dict
+    def to_dict(self) -> dict[str, Any]:
+        config_dict = super().to_dict()
+        config_dict.pop("checkpoint_format")
+        # Compatible with legacy field: version
+        config_dict["version"] = self.format
+        return config_dict
 
 
 @dataclass
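With AwqConfig now a GPTQConfig subclass, legacy version= and backend="autoawq" arguments are translated before delegating to the parent initializer, and to_dict re-emits version for old consumers. A hedged sketch of the migration surface:

    from transformers import AwqConfig

    # Legacy spelling: `version` is lower-cased and stored as `format`.
    legacy = AwqConfig(bits=4, group_size=128, version="GEMM")
    assert legacy.format == "gemm"
    assert legacy.to_dict()["version"] == "gemm"  # legacy key on output

    # New spelling passes `format` through **kwargs instead:
    cfg = AwqConfig(bits=4, group_size=128, format="gemv_fast")

    # Dropped formats ("exllama", "ipex") now fail post_init's
    # allow-list check instead of selecting a kernel.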
The final two hunks are from a different file's import block:

@@ -22,7 +22,7 @@ from functools import partial
 from typing import Any, Optional, Union
 
 import numpy as np
-from huggingface_hub import create_repo
+from huggingface_hub import create_repo, is_offline_mode
 from huggingface_hub.dataclasses import validate_typed_dict
 
 from .dynamic_module_utils import custom_object_save
@@ -44,7 +44,6 @@ from .utils import (
     TensorType,
     add_start_docstrings,
     copy_func,
-    is_offline_mode,
     is_torch_available,
     is_torchcodec_available,
     is_torchvision_v2_available,
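Together these two hunks replace the transformers.utils re-export of is_offline_mode with a direct import from huggingface_hub (which exports the helper in the versions this release targets, as the first hunk shows). A hedged usage sketch:

    from huggingface_hub import is_offline_mode

    # True when offline mode is enabled (e.g. via HF_HUB_OFFLINE=1);
    # callers can then skip Hub lookups and rely on the local cache.
    if is_offline_mode():
        print("Offline: using only locally cached files.")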