transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,273 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...configuration_utils import PreTrainedConfig
+from ...utils import auto_docstring
+from ..auto import CONFIG_MAPPING
+from ..llava.configuration_llava import LlavaConfig
+from ..llava.modeling_llava import (
+    LlavaForConditionalGeneration,
+    LlavaModel,
+    LlavaMultiModalProjector,
+    LlavaPreTrainedModel,
+)
+
+
+class FastVlmConfig(LlavaConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`FastVlmForConditionalGeneration`]. It is used to instantiate a
+    FastVLM model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield the same configuration as the one of FastVLM-7B.
+
+    e.g. [KamilaMila/FastVLM-7B](https://huggingface.co/KamilaMila/FastVLM-7B)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `TimmWrapperConfig` for `fastvit_mci3`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Qwen2Config`):
+            The config object or dictionary of the text backbone.
+        image_token_id (`int`, *optional*, defaults to 151646):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Only "full" supported.
+        vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -1):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features. Only -1 supported.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
+
+    Example:
+
+    ```python
+    >>> from transformers import FastVlmForConditionalGeneration, FastVlmConfig
+
+    >>> # Initializing a FastVLM-7B style configuration
+    >>> configuration = FastVlmConfig()
+
+    >>> # Initializing a model from the FastVLM-7B style configuration
+    >>> model = FastVlmForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "fast_vlm"
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_id=151646,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="full",
+        vision_feature_layer=-1,
+        multimodal_projector_bias=True,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.projector_hidden_act = projector_hidden_act
+
+        if vision_feature_select_strategy != "full":
+            raise ValueError(
+                f"Unexpected select feature strategy: {vision_feature_select_strategy}. Only 'full' is supported in FastVLM."
+            )
+
+        if vision_feature_layer != -1:
+            raise ValueError(
+                f"Unexpected vision feature layer: {vision_feature_layer}. Only -1 is supported in FastVLM."
+            )
+
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "timm_wrapper")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["timm_wrapper"](
+                architecture="fastvit_mci3",
+                do_pooling=True,
+                global_pool="avg",
+                hidden_size=3072,
+                initializer_range=0.02,
+                model_args={"inference_mode": True},
+            )
+
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "qwen2")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["qwen2"](
+                hidden_size=3584,
+                vocab_size=152128,
+                intermediate_size=18944,
+                num_attention_heads=28,
+                num_key_value_heads=4,
+                num_hidden_layers=28,
+            )
+
+        self.text_config = text_config
+        self.multimodal_projector_bias = multimodal_projector_bias
+
+        PreTrainedConfig.__init__(**kwargs)
+
+
+class FastVlmMultiModalProjector(LlavaMultiModalProjector):
+    def __init__(self, config: FastVlmConfig):
+        nn.Module.__init__()
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size,
+            config.text_config.hidden_size,
+            bias=config.multimodal_projector_bias,
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
+
+
+class FastVlmPreTrainedModel(LlavaPreTrainedModel):
+    pass
+
+
+class FastVlmModel(LlavaModel):
+    _checkpoint_conversion_mapping = {}
+
+    def __init__(self, config: FastVlmConfig):
+        super().__init__(config)
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
+                The tensors corresponding to the input images.
+            vision_feature_layer (`Union[int, list[int]]`, *optional*):
+                The index/indices of the layer to select the vision feature. Only -1 supported.
+            vision_feature_select_strategy (`str`, *optional*):
+                The feature selection strategy used to select the vision feature from the vision backbone.
+                Only "full" supported.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        image_outputs = self.vision_tower(pixel_values, **kwargs)
+
+        # since the vision tower is hybrid in FastVLM, its output needs to be handled differently from Llava
+        selected_image_feature = image_outputs.last_hidden_state
+        selected_image_feature = selected_image_feature.flatten(2).permute(0, 2, 1)
+        image_features = self.multi_modal_projector(selected_image_feature)
+        image_features = list(image_features)
+        return image_features
+
+    def forward(self, **super_kwargs):
+        r"""
+        vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
+            The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
+            corresponding indices will be concatenated to form the vision features. Only -1 supported.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
+        """
+        super().forward(**super_kwargs)
+
+
+@auto_docstring(
+    custom_intro="""
+    The FastVlm model which consists of a vision backbone and a language model.
+    """
+)
+class FastVlmForConditionalGeneration(LlavaForConditionalGeneration):
+    _checkpoint_conversion_mapping = {}
+
+    def forward(self, **super_kwargs):
+        r"""
+        vision_feature_layer (`Union[int, list[int], NoneType]`, *optional*):
+            The index of the layer to select the vision feature. If multiple indices are provided, the vision feature of the
+            corresponding indices will be concatenated to form the vision features. Only -1 supported.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone. Only "full" supported.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
+        >>> import torch
+
+        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        >>> model = AutoModelForImageTextToText.from_pretrained("KamilaMila/FastVLM-0.5B").to(device)
+        >>> processor = AutoProcessor.from_pretrained("KamilaMila/FastVLM-0.5B")
+
+        >>> conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What are these?"},
+                    {"type": "image"}
+                ]
+            }
+        ]
+
+        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0])
+        system\n You are a helpful assistant.\n user\n What are these?\n assistant\n The image depicts a traditional Chinese street...
+        ```"""
+        super().forward(**super_kwargs)
+
+
+__all__ = ["FastVlmForConditionalGeneration", "FastVlmModel", "FastVlmPreTrainedModel", "FastVlmConfig"]
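In `get_image_features` above, the convolutional vision tower returns a spatial feature map rather than a sequence of patch tokens, so its output is flattened to `(num_images, image_length, embed_dim)` before the projector. A minimal sketch of that reshaping on a dummy tensor (the shapes below are chosen only for illustration, not taken from a real checkpoint):

```python
import torch

# Dummy "last_hidden_state" from a conv-style vision tower: (num_images, embed_dim, H, W)
last_hidden_state = torch.randn(2, 3072, 16, 16)

# flatten(2) merges H and W, permute moves channels last: (num_images, H*W, embed_dim)
image_features = last_hidden_state.flatten(2).permute(0, 2, 1)
print(image_features.shape)  # torch.Size([2, 256, 3072])
```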
@@ -514,7 +514,7 @@ class FastSpeech2ConformerConvolutionModule(nn.Module):
 
         Args:
             hidden_states (`torch.Tensor` of shape `(batch, time, channels)`): Input tensor.
-            attention_mask (`torch.Tensor` of shape `(batch, 1, time)`): Attention mask.
+            attention_mask (`torch.Tensor` of shape `(batch, 1, time, time)`): Attention mask.
 
         Returns:
             `torch.Tensor`: Output tensor of shape `(batch, time, channels)`.
@@ -530,7 +530,10 @@ class FastSpeech2ConformerConvolutionModule(nn.Module):
 
         # Apply padding mask before convolution
         if attention_mask is not None:
-            all_masked_rows = torch.all(~attention_mask, dim=-1)
+            if attention_mask.dtype == torch.bool:
+                all_masked_rows = torch.all(~attention_mask, dim=2)
+            else:
+                all_masked_rows = torch.all(~(attention_mask == 0.0), dim=2)
             hidden_states = hidden_states.masked_fill(all_masked_rows, 0.0)
 
         # 1D Depthwise Conv
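The new branch distinguishes boolean padding masks from additive float masks, where attendable positions are 0.0 and masked positions hold a large negative value. A standalone sketch of the same dtype dispatch on toy tensors (assuming a `(batch, 1, time, time)` mask as documented above; not the FastSpeech2Conformer module itself):

```python
import torch

def fully_masked_rows(attention_mask: torch.Tensor) -> torch.Tensor:
    # Boolean masks mark attendable positions with True; additive float masks
    # mark attendable positions with 0.0 and masked ones with a large negative value.
    if attention_mask.dtype == torch.bool:
        return torch.all(~attention_mask, dim=2)
    return torch.all(~(attention_mask == 0.0), dim=2)

# (batch=1, 1, time=3, time=3) mask where the last key position is padding
bool_mask = torch.tensor([[[[True, True, False]] * 3]])
float_mask = torch.zeros(1, 1, 3, 3).masked_fill(~bool_mask, torch.finfo(torch.float32).min)

# Both mask conventions identify the same fully padded position
assert torch.equal(fully_masked_rows(bool_mask), fully_masked_rows(float_mask))
print(fully_masked_rows(bool_mask))  # tensor([[[False, False,  True]]])
```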
@@ -1118,6 +1121,7 @@ class FastSpeech2ConformerModel(FastSpeech2ConformerPreTrainedModel):
         return_dict: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FastSpeech2ConformerModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -1433,7 +1437,7 @@ class FastSpeech2ConformerHifiGan(PreTrainedModel):
         waveform.
         """
     )
-    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
+    def forward(self, spectrogram: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
         r"""
         spectrogram (`torch.FloatTensor`):
             Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
@@ -1509,6 +1513,7 @@ class FastSpeech2ConformerWithHifiGan(PreTrainedModel):
         return_dict: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FastSpeech2ConformerModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
@@ -792,6 +792,7 @@ class FlaubertModel(FlaubertPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1002,6 +1003,7 @@ class FlaubertWithLMHeadModel(FlaubertPreTrainedModel, GenerationMixin):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MaskedLMOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1090,6 +1092,7 @@ class FlaubertForSequenceClassification(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, SequenceClassifierOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1195,6 +1198,7 @@ class FlaubertForTokenClassification(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, TokenClassifierOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1286,6 +1290,7 @@ class FlaubertForQuestionAnsweringSimple(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, QuestionAnsweringModelOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1423,6 +1428,7 @@ class FlaubertForQuestionAnswering(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FlaubertForQuestionAnsweringOutput]:
         r"""
         langs (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1538,6 +1544,7 @@ class FlaubertForMultipleChoice(FlaubertPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, MultipleChoiceModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
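Each of these hunks only appends a trailing `**kwargs` to a `forward` signature. A minimal sketch of why that keeps shared calling code working when it passes arguments a given head does not consume (the `Head` module below is a hypothetical stand-in, not a transformers class):

```python
import torch
from torch import nn

class Head(nn.Module):
    # Accepting **kwargs lets shared calling code pass extra arguments
    # (e.g. cache_position) that this head ignores, without a TypeError.
    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
        return hidden_states.mean(dim=1)

head = Head()
out = head(torch.randn(2, 5, 8), output_attentions=False, cache_position=None)
print(out.shape)  # torch.Size([2, 8])
```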
@@ -725,6 +725,7 @@ class FlavaImageModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
@@ -804,6 +805,7 @@ class FlavaTextModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
@@ -896,6 +898,7 @@ class FlavaMultimodalModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, BaseModelOutputWithPooling]:
         r"""
         hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
@@ -1103,6 +1106,7 @@ class FlavaModel(FlavaPreTrainedModel):
         output_attentions: Optional[bool] = None,
         output_hidden_states: bool = True,
         return_dict: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple, FlavaOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
@@ -1380,7 +1384,7 @@ class FlavaImageCodebook(FlavaPreTrainedModel):
         z_logits = self.blocks(pixel_values)
         return nn.Softmax(dim=1)(z_logits)
 
-    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+    def forward(self, pixel_values: torch.FloatTensor, **kwargs) -> torch.Tensor:
         f"""
         Args:
             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
@@ -1575,6 +1579,7 @@ class FlavaForPreTraining(FlavaPreTrainedModel):
         output_hidden_states: bool = True,
         return_dict: Optional[bool] = None,
         return_loss: Optional[bool] = None,
+        **kwargs,
     ) -> Union[tuple[torch.Tensor], FlavaForPreTrainingOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
@@ -30,7 +30,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub
+from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
@@ -38,7 +38,7 @@ from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
 from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
-from ...utils.generic import OutputRecorder, check_model_inputs
+from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_flex_olmo import FlexOlmoConfig
 
 
@@ -119,7 +119,7 @@ class FlexOlmoRotaryEmbedding(nn.Module):
         position_ids_expanded = position_ids[:, None, :].float()
 
         device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
             freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
             emb = torch.cat((freqs, freqs), dim=-1)
             cos = emb.cos() * self.attention_scaling
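`maybe_autocast` is a transformers-internal helper introduced in this release, so the sketch below does not reproduce it; it only illustrates the underlying pattern the hunk wraps, namely computing the rotary frequencies with autocast disabled so they stay in float32 (all shapes and constants here are illustrative):

```python
import torch

# Toy inverse-frequency table and positions, mirroring the rotary embedding setup
inv_freq = 1.0 / (10000 ** (torch.arange(0, 64, 2, dtype=torch.float32) / 64))
position_ids = torch.arange(8)[None, :]  # (batch=1, seq_len=8)

inv_freq_expanded = inv_freq[None, :, None].expand(1, -1, 1)
position_ids_expanded = position_ids[:, None, :].float()

with torch.autocast(device_type="cpu", enabled=False):  # keep the matmul in float32
    freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
    emb = torch.cat((freqs, freqs), dim=-1)
    cos, sin = emb.cos(), emb.sin()

print(cos.shape, cos.dtype)  # torch.Size([1, 8, 64]) torch.float32
```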
@@ -216,6 +216,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
+@use_kernelized_func(apply_rotary_pos_emb)
 class FlexOlmoAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
@@ -241,7 +242,6 @@ class FlexOlmoAttention(nn.Module):
         self.o_proj = nn.Linear(
             config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
-        self.rotary_fn = apply_rotary_pos_emb
         self.q_norm = FlexOlmoRMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
         self.k_norm = FlexOlmoRMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
 
@@ -252,7 +252,6 @@ class FlexOlmoAttention(nn.Module):
         attention_mask: Optional[torch.Tensor],
         past_key_values: Optional[Cache] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
@@ -541,7 +541,7 @@ class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
  # Initialize weights and apply final processing
  self.post_init()

- def forward(self, hidden_states: torch.Tensor):
+ def forward(self, hidden_states: torch.Tensor, **kwargs):
  for conv, block in zip(self.convs, self.blocks):
  hidden_states = conv(hidden_states)
  for layer in block:
@@ -708,6 +708,7 @@ class Florence2Model(Florence2PreTrainedModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
  ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
@@ -1422,7 +1422,7 @@ class Florence2VisionBackbone(Florence2VisionPreTrainedModel):
  # Initialize weights and apply final processing
  self.post_init()

- def forward(self, hidden_states: torch.Tensor):
+ def forward(self, hidden_states: torch.Tensor, **kwargs):
  for conv, block in zip(self.convs, self.blocks):
  hidden_states = conv(hidden_states)
  for layer in block:
@@ -1551,6 +1551,7 @@ class Florence2Model(LlavaModel):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.LongTensor] = None,
+ **kwargs,
  ) -> Union[tuple, Florence2Seq2SeqModelOutput]:
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
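The Florence2 hunks above, and the FNet, FocalNet, FSMT, and Funnel hunks that follow, all make the same change: `forward` signatures gain a trailing `**kwargs` so that extra keyword arguments passed by generic calling code are tolerated rather than raising a `TypeError`. A toy example of the pattern (class and argument names are placeholders, not taken from the diff):

```python
# Illustrative only: a forward that accepts and ignores extra keyword arguments.
import torch


class TinyModel(torch.nn.Module):
    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
        # extra kwargs such as output_attentions or cache_position are ignored
        return hidden_states


out = TinyModel()(torch.zeros(1, 4, 8), output_attentions=False)  # no TypeError
```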
@@ -439,6 +439,7 @@ class FNetModel(FNetPreTrainedModel):
  inputs_embeds: Optional[torch.FloatTensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, BaseModelOutput]:
  output_hidden_states = (
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -540,6 +541,7 @@ class FNetForPreTraining(FNetPreTrainedModel):
  next_sentence_label: Optional[torch.Tensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, FNetForPreTrainingOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -632,6 +634,7 @@ class FNetForMaskedLM(FNetPreTrainedModel):
  labels: Optional[torch.Tensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, MaskedLMOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -783,6 +786,7 @@ class FNetForSequenceClassification(FNetPreTrainedModel):
  labels: Optional[torch.Tensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, SequenceClassifierOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -856,6 +860,7 @@ class FNetForMultipleChoice(FNetPreTrainedModel):
  labels: Optional[torch.Tensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, MultipleChoiceModelOutput]:
  r"""
  input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
@@ -950,6 +955,7 @@ class FNetForTokenClassification(FNetPreTrainedModel):
  labels: Optional[torch.Tensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, TokenClassifierOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1008,6 +1014,7 @@ class FNetForQuestionAnswering(FNetPreTrainedModel):
  end_positions: Optional[torch.Tensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, QuestionAnsweringModelOutput]:
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -628,6 +628,7 @@ class FocalNetModel(FocalNetPreTrainedModel):
  bool_masked_pos: Optional[torch.BoolTensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, FocalNetModelOutput]:
  r"""
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -710,6 +711,7 @@ class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
  bool_masked_pos: Optional[torch.BoolTensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, FocalNetMaskedImageModelingOutput]:
  r"""
  bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
@@ -812,6 +814,7 @@ class FocalNetForImageClassification(FocalNetPreTrainedModel):
  labels: Optional[torch.LongTensor] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, FocalNetImageClassifierOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -871,6 +874,7 @@ class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
  pixel_values: torch.Tensor,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> BackboneOutput:
  r"""
  Examples:
@@ -843,6 +843,7 @@ class FSMTModel(PretrainedFSMTModel):
  decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -980,6 +981,7 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel, GenerationMixin):
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
  cache_position: Optional[torch.Tensor] = None,
+ **kwargs,
  ) -> Union[tuple[torch.Tensor], Seq2SeqLMOutput]:
  r"""
  decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -766,6 +766,7 @@ class FunnelBaseModel(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, BaseModelOutput]:
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
@@ -832,6 +833,7 @@ class FunnelModel(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, BaseModelOutput]:
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
@@ -923,6 +925,7 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, FunnelForPreTrainingOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1012,6 +1015,7 @@ class FunnelForMaskedLM(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, MaskedLMOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1079,6 +1083,7 @@ class FunnelForSequenceClassification(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, SequenceClassifierOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1158,6 +1163,7 @@ class FunnelForMultipleChoice(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, MultipleChoiceModelOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@@ -1233,6 +1239,7 @@ class FunnelForTokenClassification(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, TokenClassifierOutput]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -1295,6 +1302,7 @@ class FunnelForQuestionAnswering(FunnelPreTrainedModel):
  output_attentions: Optional[bool] = None,
  output_hidden_states: Optional[bool] = None,
  return_dict: Optional[bool] = None,
+ **kwargs,
  ) -> Union[tuple, QuestionAnsweringModelOutput]:
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict