transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc3-py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that registry.
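For readers who want to reproduce a file-level summary like the listing below, here is a minimal sketch using only the Python standard library. It assumes both wheels have already been downloaded locally (for example with pip download --no-deps transformers==5.0.0rc1); the filenames and the per-file +/- counting are illustrative and are not how the registry's own diff viewer is implemented.

    import difflib
    import zipfile

    # Assumed local wheel filenames, derived from the versions in the title above.
    OLD = "transformers-5.0.0rc1-py3-none-any.whl"
    NEW = "transformers-5.0.0rc3-py3-none-any.whl"

    def read_texts(wheel_path):
        """Map each .py member of the wheel (a zip archive) to its decoded lines."""
        with zipfile.ZipFile(wheel_path) as wheel:
            return {
                name: wheel.read(name).decode("utf-8", errors="replace").splitlines()
                for name in wheel.namelist()
                if name.endswith(".py")
            }

    old, new = read_texts(OLD), read_texts(NEW)
    for name in sorted(old.keys() | new.keys()):
        diff = list(difflib.unified_diff(old.get(name, []), new.get(name, []), lineterm=""))
        # Count added/removed lines, skipping the "+++"/"---" file header lines.
        added = sum(1 for line in diff if line.startswith("+") and not line.startswith("+++"))
        removed = sum(1 for line in diff if line.startswith("-") and not line.startswith("---"))
        if added or removed:
            print(f"{name} +{added} -{removed}")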
Files changed (1584)
  1. transformers/__init__.py +27 -27
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +32 -33
  4. transformers/cache_utils.py +32 -139
  5. transformers/cli/chat.py +3 -3
  6. transformers/cli/serve.py +2 -2
  7. transformers/cli/transformers.py +2 -1
  8. transformers/configuration_utils.py +143 -101
  9. transformers/conversion_mapping.py +73 -6
  10. transformers/convert_slow_tokenizer.py +3 -8
  11. transformers/core_model_loading.py +215 -50
  12. transformers/data/processors/glue.py +0 -1
  13. transformers/data/processors/utils.py +0 -1
  14. transformers/data/processors/xnli.py +0 -1
  15. transformers/dependency_versions_table.py +5 -5
  16. transformers/distributed/configuration_utils.py +1 -2
  17. transformers/dynamic_module_utils.py +23 -23
  18. transformers/feature_extraction_sequence_utils.py +19 -23
  19. transformers/feature_extraction_utils.py +63 -31
  20. transformers/generation/candidate_generator.py +80 -33
  21. transformers/generation/configuration_utils.py +186 -131
  22. transformers/generation/continuous_batching/__init__.py +0 -1
  23. transformers/generation/continuous_batching/cache.py +81 -24
  24. transformers/generation/continuous_batching/cache_manager.py +155 -45
  25. transformers/generation/continuous_batching/continuous_api.py +152 -84
  26. transformers/generation/continuous_batching/requests.py +51 -3
  27. transformers/generation/continuous_batching/scheduler.py +127 -52
  28. transformers/generation/logits_process.py +0 -128
  29. transformers/generation/stopping_criteria.py +1 -1
  30. transformers/generation/streamers.py +0 -1
  31. transformers/generation/utils.py +107 -119
  32. transformers/generation/watermarking.py +8 -6
  33. transformers/hf_argparser.py +9 -13
  34. transformers/hyperparameter_search.py +1 -2
  35. transformers/image_processing_base.py +11 -21
  36. transformers/image_processing_utils.py +11 -12
  37. transformers/image_processing_utils_fast.py +68 -57
  38. transformers/image_transforms.py +29 -29
  39. transformers/image_utils.py +30 -32
  40. transformers/initialization.py +37 -0
  41. transformers/integrations/__init__.py +12 -0
  42. transformers/integrations/accelerate.py +44 -111
  43. transformers/integrations/aqlm.py +3 -5
  44. transformers/integrations/awq.py +3 -8
  45. transformers/integrations/bitnet.py +5 -8
  46. transformers/integrations/bitsandbytes.py +16 -15
  47. transformers/integrations/deepspeed.py +19 -4
  48. transformers/integrations/eetq.py +3 -6
  49. transformers/integrations/fbgemm_fp8.py +2 -3
  50. transformers/integrations/finegrained_fp8.py +14 -23
  51. transformers/integrations/flash_attention.py +2 -2
  52. transformers/integrations/flex_attention.py +1 -1
  53. transformers/integrations/fp_quant.py +4 -6
  54. transformers/integrations/ggml.py +0 -1
  55. transformers/integrations/higgs.py +2 -5
  56. transformers/integrations/hub_kernels.py +23 -5
  57. transformers/integrations/integration_utils.py +37 -3
  58. transformers/integrations/mistral.py +12 -0
  59. transformers/integrations/moe.py +240 -0
  60. transformers/integrations/mxfp4.py +9 -16
  61. transformers/integrations/peft.py +5 -0
  62. transformers/integrations/quanto.py +5 -2
  63. transformers/integrations/quark.py +2 -4
  64. transformers/integrations/spqr.py +3 -5
  65. transformers/integrations/tensor_parallel.py +167 -221
  66. transformers/integrations/torchao.py +4 -6
  67. transformers/integrations/vptq.py +3 -5
  68. transformers/loss/loss_lw_detr.py +356 -0
  69. transformers/loss/loss_utils.py +2 -0
  70. transformers/masking_utils.py +47 -51
  71. transformers/model_debugging_utils.py +4 -5
  72. transformers/modelcard.py +14 -192
  73. transformers/modeling_attn_mask_utils.py +19 -19
  74. transformers/modeling_flash_attention_utils.py +27 -27
  75. transformers/modeling_gguf_pytorch_utils.py +71 -24
  76. transformers/modeling_layers.py +21 -22
  77. transformers/modeling_outputs.py +242 -253
  78. transformers/modeling_rope_utils.py +110 -113
  79. transformers/modeling_utils.py +633 -576
  80. transformers/models/__init__.py +23 -0
  81. transformers/models/afmoe/configuration_afmoe.py +26 -29
  82. transformers/models/afmoe/modeling_afmoe.py +37 -49
  83. transformers/models/afmoe/modular_afmoe.py +21 -31
  84. transformers/models/aimv2/configuration_aimv2.py +2 -5
  85. transformers/models/aimv2/modeling_aimv2.py +24 -21
  86. transformers/models/aimv2/modular_aimv2.py +11 -9
  87. transformers/models/albert/configuration_albert.py +0 -1
  88. transformers/models/albert/modeling_albert.py +70 -69
  89. transformers/models/albert/tokenization_albert.py +1 -4
  90. transformers/models/align/configuration_align.py +0 -1
  91. transformers/models/align/modeling_align.py +73 -68
  92. transformers/models/align/processing_align.py +2 -30
  93. transformers/models/altclip/configuration_altclip.py +0 -1
  94. transformers/models/altclip/modeling_altclip.py +83 -80
  95. transformers/models/altclip/processing_altclip.py +2 -15
  96. transformers/models/apertus/__init__.py +0 -1
  97. transformers/models/apertus/configuration_apertus.py +18 -21
  98. transformers/models/apertus/modeling_apertus.py +35 -36
  99. transformers/models/apertus/modular_apertus.py +32 -31
  100. transformers/models/arcee/configuration_arcee.py +20 -23
  101. transformers/models/arcee/modeling_arcee.py +32 -35
  102. transformers/models/arcee/modular_arcee.py +20 -23
  103. transformers/models/aria/configuration_aria.py +20 -23
  104. transformers/models/aria/image_processing_aria.py +25 -27
  105. transformers/models/aria/modeling_aria.py +71 -70
  106. transformers/models/aria/modular_aria.py +85 -88
  107. transformers/models/aria/processing_aria.py +28 -35
  108. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
  109. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
  110. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +6 -8
  111. transformers/models/audioflamingo3/__init__.py +0 -1
  112. transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
  113. transformers/models/audioflamingo3/modeling_audioflamingo3.py +22 -23
  114. transformers/models/audioflamingo3/modular_audioflamingo3.py +12 -17
  115. transformers/models/audioflamingo3/processing_audioflamingo3.py +33 -30
  116. transformers/models/auto/auto_factory.py +5 -6
  117. transformers/models/auto/configuration_auto.py +53 -5
  118. transformers/models/auto/feature_extraction_auto.py +12 -10
  119. transformers/models/auto/image_processing_auto.py +17 -28
  120. transformers/models/auto/modeling_auto.py +38 -188
  121. transformers/models/auto/processing_auto.py +6 -1
  122. transformers/models/auto/tokenization_auto.py +147 -169
  123. transformers/models/auto/video_processing_auto.py +12 -10
  124. transformers/models/autoformer/configuration_autoformer.py +4 -7
  125. transformers/models/autoformer/modeling_autoformer.py +98 -100
  126. transformers/models/aya_vision/configuration_aya_vision.py +0 -1
  127. transformers/models/aya_vision/modeling_aya_vision.py +42 -40
  128. transformers/models/aya_vision/modular_aya_vision.py +26 -29
  129. transformers/models/aya_vision/processing_aya_vision.py +25 -53
  130. transformers/models/bamba/configuration_bamba.py +29 -32
  131. transformers/models/bamba/modeling_bamba.py +78 -83
  132. transformers/models/bamba/modular_bamba.py +68 -71
  133. transformers/models/bark/configuration_bark.py +4 -7
  134. transformers/models/bark/generation_configuration_bark.py +3 -5
  135. transformers/models/bark/modeling_bark.py +49 -55
  136. transformers/models/bark/processing_bark.py +19 -41
  137. transformers/models/bart/configuration_bart.py +0 -2
  138. transformers/models/bart/modeling_bart.py +122 -117
  139. transformers/models/barthez/tokenization_barthez.py +1 -4
  140. transformers/models/bartpho/tokenization_bartpho.py +6 -7
  141. transformers/models/beit/configuration_beit.py +0 -11
  142. transformers/models/beit/image_processing_beit.py +53 -56
  143. transformers/models/beit/image_processing_beit_fast.py +8 -10
  144. transformers/models/beit/modeling_beit.py +51 -53
  145. transformers/models/bert/configuration_bert.py +0 -1
  146. transformers/models/bert/modeling_bert.py +114 -122
  147. transformers/models/bert/tokenization_bert.py +2 -4
  148. transformers/models/bert/tokenization_bert_legacy.py +3 -5
  149. transformers/models/bert_generation/configuration_bert_generation.py +0 -1
  150. transformers/models/bert_generation/modeling_bert_generation.py +49 -49
  151. transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
  152. transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
  153. transformers/models/bertweet/tokenization_bertweet.py +1 -3
  154. transformers/models/big_bird/configuration_big_bird.py +0 -1
  155. transformers/models/big_bird/modeling_big_bird.py +110 -109
  156. transformers/models/big_bird/tokenization_big_bird.py +1 -4
  157. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +0 -1
  158. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +116 -111
  159. transformers/models/biogpt/configuration_biogpt.py +0 -1
  160. transformers/models/biogpt/modeling_biogpt.py +69 -71
  161. transformers/models/biogpt/modular_biogpt.py +59 -61
  162. transformers/models/biogpt/tokenization_biogpt.py +3 -5
  163. transformers/models/bit/configuration_bit.py +0 -1
  164. transformers/models/bit/image_processing_bit.py +21 -24
  165. transformers/models/bit/image_processing_bit_fast.py +0 -1
  166. transformers/models/bit/modeling_bit.py +14 -12
  167. transformers/models/bitnet/configuration_bitnet.py +18 -21
  168. transformers/models/bitnet/modeling_bitnet.py +32 -35
  169. transformers/models/bitnet/modular_bitnet.py +4 -6
  170. transformers/models/blenderbot/configuration_blenderbot.py +0 -1
  171. transformers/models/blenderbot/modeling_blenderbot.py +71 -95
  172. transformers/models/blenderbot/tokenization_blenderbot.py +6 -8
  173. transformers/models/blenderbot_small/configuration_blenderbot_small.py +0 -1
  174. transformers/models/blenderbot_small/modeling_blenderbot_small.py +73 -68
  175. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
  176. transformers/models/blip/configuration_blip.py +0 -1
  177. transformers/models/blip/image_processing_blip.py +17 -20
  178. transformers/models/blip/image_processing_blip_fast.py +0 -1
  179. transformers/models/blip/modeling_blip.py +62 -71
  180. transformers/models/blip/modeling_blip_text.py +71 -65
  181. transformers/models/blip/processing_blip.py +5 -36
  182. transformers/models/blip_2/configuration_blip_2.py +0 -1
  183. transformers/models/blip_2/modeling_blip_2.py +72 -71
  184. transformers/models/blip_2/processing_blip_2.py +8 -38
  185. transformers/models/bloom/configuration_bloom.py +0 -1
  186. transformers/models/bloom/modeling_bloom.py +71 -103
  187. transformers/models/blt/configuration_blt.py +71 -74
  188. transformers/models/blt/modeling_blt.py +235 -78
  189. transformers/models/blt/modular_blt.py +225 -62
  190. transformers/models/bridgetower/configuration_bridgetower.py +0 -1
  191. transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
  192. transformers/models/bridgetower/image_processing_bridgetower_fast.py +7 -10
  193. transformers/models/bridgetower/modeling_bridgetower.py +113 -109
  194. transformers/models/bridgetower/processing_bridgetower.py +2 -16
  195. transformers/models/bros/configuration_bros.py +0 -1
  196. transformers/models/bros/modeling_bros.py +86 -80
  197. transformers/models/bros/processing_bros.py +2 -12
  198. transformers/models/byt5/tokenization_byt5.py +4 -6
  199. transformers/models/camembert/configuration_camembert.py +0 -1
  200. transformers/models/camembert/modeling_camembert.py +196 -195
  201. transformers/models/camembert/modular_camembert.py +51 -54
  202. transformers/models/camembert/tokenization_camembert.py +1 -4
  203. transformers/models/canine/configuration_canine.py +0 -1
  204. transformers/models/canine/modeling_canine.py +79 -75
  205. transformers/models/canine/tokenization_canine.py +2 -1
  206. transformers/models/chameleon/configuration_chameleon.py +24 -27
  207. transformers/models/chameleon/image_processing_chameleon.py +21 -24
  208. transformers/models/chameleon/image_processing_chameleon_fast.py +0 -1
  209. transformers/models/chameleon/modeling_chameleon.py +62 -60
  210. transformers/models/chameleon/processing_chameleon.py +16 -41
  211. transformers/models/chinese_clip/configuration_chinese_clip.py +0 -1
  212. transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
  213. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
  214. transformers/models/chinese_clip/modeling_chinese_clip.py +71 -69
  215. transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
  216. transformers/models/clap/configuration_clap.py +0 -1
  217. transformers/models/clap/feature_extraction_clap.py +11 -12
  218. transformers/models/clap/modeling_clap.py +113 -104
  219. transformers/models/clap/processing_clap.py +2 -15
  220. transformers/models/clip/configuration_clip.py +0 -1
  221. transformers/models/clip/image_processing_clip.py +21 -24
  222. transformers/models/clip/image_processing_clip_fast.py +0 -1
  223. transformers/models/clip/modeling_clip.py +47 -46
  224. transformers/models/clip/processing_clip.py +2 -14
  225. transformers/models/clip/tokenization_clip.py +2 -5
  226. transformers/models/clipseg/configuration_clipseg.py +0 -1
  227. transformers/models/clipseg/modeling_clipseg.py +90 -87
  228. transformers/models/clipseg/processing_clipseg.py +8 -39
  229. transformers/models/clvp/configuration_clvp.py +1 -3
  230. transformers/models/clvp/feature_extraction_clvp.py +7 -10
  231. transformers/models/clvp/modeling_clvp.py +133 -118
  232. transformers/models/clvp/number_normalizer.py +1 -2
  233. transformers/models/clvp/processing_clvp.py +3 -20
  234. transformers/models/clvp/tokenization_clvp.py +0 -1
  235. transformers/models/code_llama/tokenization_code_llama.py +4 -7
  236. transformers/models/codegen/configuration_codegen.py +0 -1
  237. transformers/models/codegen/modeling_codegen.py +61 -52
  238. transformers/models/codegen/tokenization_codegen.py +5 -6
  239. transformers/models/cohere/configuration_cohere.py +20 -23
  240. transformers/models/cohere/modeling_cohere.py +36 -39
  241. transformers/models/cohere/modular_cohere.py +24 -28
  242. transformers/models/cohere/tokenization_cohere.py +5 -6
  243. transformers/models/cohere2/configuration_cohere2.py +21 -24
  244. transformers/models/cohere2/modeling_cohere2.py +35 -38
  245. transformers/models/cohere2/modular_cohere2.py +39 -41
  246. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +6 -8
  247. transformers/models/cohere2_vision/modeling_cohere2_vision.py +35 -33
  248. transformers/models/cohere2_vision/modular_cohere2_vision.py +21 -23
  249. transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
  250. transformers/models/colpali/configuration_colpali.py +0 -1
  251. transformers/models/colpali/modeling_colpali.py +14 -16
  252. transformers/models/colpali/modular_colpali.py +11 -51
  253. transformers/models/colpali/processing_colpali.py +14 -52
  254. transformers/models/colqwen2/modeling_colqwen2.py +20 -22
  255. transformers/models/colqwen2/modular_colqwen2.py +29 -68
  256. transformers/models/colqwen2/processing_colqwen2.py +16 -52
  257. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -2
  258. transformers/models/conditional_detr/image_processing_conditional_detr.py +64 -66
  259. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +22 -22
  260. transformers/models/conditional_detr/modeling_conditional_detr.py +82 -81
  261. transformers/models/conditional_detr/modular_conditional_detr.py +1 -3
  262. transformers/models/convbert/configuration_convbert.py +0 -1
  263. transformers/models/convbert/modeling_convbert.py +88 -87
  264. transformers/models/convbert/tokenization_convbert.py +0 -1
  265. transformers/models/convnext/configuration_convnext.py +0 -1
  266. transformers/models/convnext/image_processing_convnext.py +20 -23
  267. transformers/models/convnext/image_processing_convnext_fast.py +14 -19
  268. transformers/models/convnext/modeling_convnext.py +5 -8
  269. transformers/models/convnextv2/configuration_convnextv2.py +0 -1
  270. transformers/models/convnextv2/modeling_convnextv2.py +5 -8
  271. transformers/models/cpm/tokenization_cpm.py +6 -7
  272. transformers/models/cpm/tokenization_cpm_fast.py +3 -5
  273. transformers/models/cpmant/configuration_cpmant.py +0 -1
  274. transformers/models/cpmant/modeling_cpmant.py +38 -40
  275. transformers/models/cpmant/tokenization_cpmant.py +1 -3
  276. transformers/models/csm/configuration_csm.py +49 -51
  277. transformers/models/csm/generation_csm.py +31 -35
  278. transformers/models/csm/modeling_csm.py +81 -82
  279. transformers/models/csm/modular_csm.py +58 -58
  280. transformers/models/csm/processing_csm.py +25 -68
  281. transformers/models/ctrl/configuration_ctrl.py +0 -1
  282. transformers/models/ctrl/modeling_ctrl.py +52 -43
  283. transformers/models/ctrl/tokenization_ctrl.py +0 -1
  284. transformers/models/cvt/configuration_cvt.py +0 -1
  285. transformers/models/cvt/modeling_cvt.py +18 -16
  286. transformers/models/cwm/__init__.py +0 -1
  287. transformers/models/cwm/configuration_cwm.py +3 -5
  288. transformers/models/cwm/modeling_cwm.py +33 -35
  289. transformers/models/cwm/modular_cwm.py +10 -12
  290. transformers/models/d_fine/configuration_d_fine.py +3 -5
  291. transformers/models/d_fine/modeling_d_fine.py +127 -121
  292. transformers/models/d_fine/modular_d_fine.py +23 -13
  293. transformers/models/dab_detr/configuration_dab_detr.py +2 -3
  294. transformers/models/dab_detr/modeling_dab_detr.py +69 -71
  295. transformers/models/dac/configuration_dac.py +0 -1
  296. transformers/models/dac/feature_extraction_dac.py +6 -9
  297. transformers/models/dac/modeling_dac.py +21 -23
  298. transformers/models/data2vec/configuration_data2vec_audio.py +0 -1
  299. transformers/models/data2vec/configuration_data2vec_text.py +0 -1
  300. transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
  301. transformers/models/data2vec/modeling_data2vec_audio.py +52 -56
  302. transformers/models/data2vec/modeling_data2vec_text.py +98 -93
  303. transformers/models/data2vec/modeling_data2vec_vision.py +41 -42
  304. transformers/models/data2vec/modular_data2vec_audio.py +6 -1
  305. transformers/models/data2vec/modular_data2vec_text.py +58 -54
  306. transformers/models/dbrx/configuration_dbrx.py +27 -20
  307. transformers/models/dbrx/modeling_dbrx.py +40 -43
  308. transformers/models/dbrx/modular_dbrx.py +31 -33
  309. transformers/models/deberta/configuration_deberta.py +0 -1
  310. transformers/models/deberta/modeling_deberta.py +59 -60
  311. transformers/models/deberta/tokenization_deberta.py +2 -5
  312. transformers/models/deberta_v2/configuration_deberta_v2.py +0 -1
  313. transformers/models/deberta_v2/modeling_deberta_v2.py +65 -65
  314. transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
  315. transformers/models/decision_transformer/configuration_decision_transformer.py +0 -1
  316. transformers/models/decision_transformer/modeling_decision_transformer.py +56 -55
  317. transformers/models/deepseek_v2/configuration_deepseek_v2.py +34 -37
  318. transformers/models/deepseek_v2/modeling_deepseek_v2.py +39 -37
  319. transformers/models/deepseek_v2/modular_deepseek_v2.py +44 -44
  320. transformers/models/deepseek_v3/configuration_deepseek_v3.py +35 -38
  321. transformers/models/deepseek_v3/modeling_deepseek_v3.py +40 -38
  322. transformers/models/deepseek_v3/modular_deepseek_v3.py +10 -7
  323. transformers/models/deepseek_vl/configuration_deepseek_vl.py +2 -3
  324. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +25 -26
  325. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +7 -7
  326. transformers/models/deepseek_vl/modeling_deepseek_vl.py +40 -36
  327. transformers/models/deepseek_vl/modular_deepseek_vl.py +14 -43
  328. transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
  329. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +3 -5
  330. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +35 -35
  331. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +16 -20
  332. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +42 -38
  333. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +80 -99
  334. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
  335. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -3
  336. transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
  337. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +17 -17
  338. transformers/models/deformable_detr/modeling_deformable_detr.py +67 -68
  339. transformers/models/deformable_detr/modular_deformable_detr.py +1 -3
  340. transformers/models/deit/configuration_deit.py +0 -1
  341. transformers/models/deit/image_processing_deit.py +18 -21
  342. transformers/models/deit/image_processing_deit_fast.py +0 -1
  343. transformers/models/deit/modeling_deit.py +16 -18
  344. transformers/models/depth_anything/configuration_depth_anything.py +2 -4
  345. transformers/models/depth_anything/modeling_depth_anything.py +5 -8
  346. transformers/models/depth_pro/configuration_depth_pro.py +0 -1
  347. transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
  348. transformers/models/depth_pro/image_processing_depth_pro_fast.py +6 -8
  349. transformers/models/depth_pro/modeling_depth_pro.py +21 -23
  350. transformers/models/detr/configuration_detr.py +1 -2
  351. transformers/models/detr/image_processing_detr.py +64 -66
  352. transformers/models/detr/image_processing_detr_fast.py +22 -23
  353. transformers/models/detr/modeling_detr.py +78 -73
  354. transformers/models/dia/configuration_dia.py +5 -8
  355. transformers/models/dia/feature_extraction_dia.py +6 -9
  356. transformers/models/dia/generation_dia.py +42 -45
  357. transformers/models/dia/modeling_dia.py +73 -65
  358. transformers/models/dia/modular_dia.py +63 -54
  359. transformers/models/dia/processing_dia.py +39 -29
  360. transformers/models/dia/tokenization_dia.py +3 -6
  361. transformers/models/diffllama/configuration_diffllama.py +20 -23
  362. transformers/models/diffllama/modeling_diffllama.py +44 -47
  363. transformers/models/diffllama/modular_diffllama.py +17 -19
  364. transformers/models/dinat/configuration_dinat.py +0 -1
  365. transformers/models/dinat/modeling_dinat.py +40 -42
  366. transformers/models/dinov2/configuration_dinov2.py +0 -1
  367. transformers/models/dinov2/modeling_dinov2.py +11 -13
  368. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +1 -1
  369. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +12 -13
  370. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +5 -7
  371. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +4 -7
  372. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +3 -6
  373. transformers/models/dinov3_vit/configuration_dinov3_vit.py +5 -8
  374. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +5 -7
  375. transformers/models/dinov3_vit/modeling_dinov3_vit.py +17 -16
  376. transformers/models/dinov3_vit/modular_dinov3_vit.py +14 -13
  377. transformers/models/distilbert/configuration_distilbert.py +0 -1
  378. transformers/models/distilbert/modeling_distilbert.py +55 -55
  379. transformers/models/distilbert/tokenization_distilbert.py +0 -1
  380. transformers/models/doge/__init__.py +0 -1
  381. transformers/models/doge/configuration_doge.py +25 -28
  382. transformers/models/doge/modeling_doge.py +43 -46
  383. transformers/models/doge/modular_doge.py +57 -58
  384. transformers/models/donut/configuration_donut_swin.py +0 -1
  385. transformers/models/donut/image_processing_donut.py +26 -29
  386. transformers/models/donut/image_processing_donut_fast.py +5 -11
  387. transformers/models/donut/modeling_donut_swin.py +60 -58
  388. transformers/models/donut/processing_donut.py +5 -26
  389. transformers/models/dots1/configuration_dots1.py +27 -29
  390. transformers/models/dots1/modeling_dots1.py +45 -39
  391. transformers/models/dots1/modular_dots1.py +0 -1
  392. transformers/models/dpr/configuration_dpr.py +0 -1
  393. transformers/models/dpr/modeling_dpr.py +37 -39
  394. transformers/models/dpr/tokenization_dpr.py +7 -9
  395. transformers/models/dpr/tokenization_dpr_fast.py +7 -9
  396. transformers/models/dpt/configuration_dpt.py +1 -2
  397. transformers/models/dpt/image_processing_dpt.py +65 -66
  398. transformers/models/dpt/image_processing_dpt_fast.py +14 -16
  399. transformers/models/dpt/modeling_dpt.py +19 -21
  400. transformers/models/dpt/modular_dpt.py +11 -13
  401. transformers/models/edgetam/configuration_edgetam.py +1 -2
  402. transformers/models/edgetam/modeling_edgetam.py +44 -43
  403. transformers/models/edgetam/modular_edgetam.py +17 -20
  404. transformers/models/edgetam_video/__init__.py +0 -1
  405. transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
  406. transformers/models/edgetam_video/modeling_edgetam_video.py +131 -120
  407. transformers/models/edgetam_video/modular_edgetam_video.py +29 -37
  408. transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
  409. transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
  410. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +5 -6
  411. transformers/models/efficientloftr/modeling_efficientloftr.py +41 -30
  412. transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
  413. transformers/models/efficientnet/configuration_efficientnet.py +0 -1
  414. transformers/models/efficientnet/image_processing_efficientnet.py +28 -32
  415. transformers/models/efficientnet/image_processing_efficientnet_fast.py +15 -17
  416. transformers/models/efficientnet/modeling_efficientnet.py +17 -15
  417. transformers/models/electra/configuration_electra.py +0 -1
  418. transformers/models/electra/modeling_electra.py +108 -103
  419. transformers/models/emu3/configuration_emu3.py +5 -7
  420. transformers/models/emu3/image_processing_emu3.py +44 -39
  421. transformers/models/emu3/modeling_emu3.py +67 -64
  422. transformers/models/emu3/modular_emu3.py +39 -35
  423. transformers/models/emu3/processing_emu3.py +18 -43
  424. transformers/models/encodec/configuration_encodec.py +2 -4
  425. transformers/models/encodec/feature_extraction_encodec.py +10 -13
  426. transformers/models/encodec/modeling_encodec.py +39 -29
  427. transformers/models/encoder_decoder/configuration_encoder_decoder.py +0 -1
  428. transformers/models/encoder_decoder/modeling_encoder_decoder.py +17 -19
  429. transformers/models/eomt/configuration_eomt.py +0 -1
  430. transformers/models/eomt/image_processing_eomt.py +53 -55
  431. transformers/models/eomt/image_processing_eomt_fast.py +59 -28
  432. transformers/models/eomt/modeling_eomt.py +23 -18
  433. transformers/models/eomt/modular_eomt.py +18 -13
  434. transformers/models/ernie/configuration_ernie.py +0 -1
  435. transformers/models/ernie/modeling_ernie.py +127 -132
  436. transformers/models/ernie/modular_ernie.py +97 -103
  437. transformers/models/ernie4_5/configuration_ernie4_5.py +18 -20
  438. transformers/models/ernie4_5/modeling_ernie4_5.py +32 -34
  439. transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
  440. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +27 -29
  441. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +52 -51
  442. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +16 -44
  443. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  444. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +329 -0
  445. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +455 -0
  446. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +231 -0
  447. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1895 -0
  448. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1901 -0
  449. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +249 -0
  450. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +593 -0
  451. transformers/models/esm/configuration_esm.py +2 -4
  452. transformers/models/esm/modeling_esm.py +38 -34
  453. transformers/models/esm/modeling_esmfold.py +48 -45
  454. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  455. transformers/models/esm/openfold_utils/loss.py +1 -2
  456. transformers/models/esm/openfold_utils/protein.py +13 -13
  457. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  458. transformers/models/esm/tokenization_esm.py +2 -4
  459. transformers/models/evolla/configuration_evolla.py +29 -32
  460. transformers/models/evolla/modeling_evolla.py +67 -62
  461. transformers/models/evolla/modular_evolla.py +53 -47
  462. transformers/models/evolla/processing_evolla.py +23 -35
  463. transformers/models/exaone4/configuration_exaone4.py +19 -22
  464. transformers/models/exaone4/modeling_exaone4.py +33 -36
  465. transformers/models/exaone4/modular_exaone4.py +40 -42
  466. transformers/models/falcon/configuration_falcon.py +22 -25
  467. transformers/models/falcon/modeling_falcon.py +75 -78
  468. transformers/models/falcon_h1/configuration_falcon_h1.py +40 -43
  469. transformers/models/falcon_h1/modeling_falcon_h1.py +80 -78
  470. transformers/models/falcon_h1/modular_falcon_h1.py +54 -50
  471. transformers/models/falcon_mamba/configuration_falcon_mamba.py +0 -1
  472. transformers/models/falcon_mamba/modeling_falcon_mamba.py +50 -47
  473. transformers/models/falcon_mamba/modular_falcon_mamba.py +16 -14
  474. transformers/models/fast_vlm/configuration_fast_vlm.py +1 -0
  475. transformers/models/fast_vlm/modeling_fast_vlm.py +43 -39
  476. transformers/models/fast_vlm/modular_fast_vlm.py +2 -3
  477. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -5
  478. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +68 -57
  479. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +2 -3
  480. transformers/models/flaubert/configuration_flaubert.py +0 -1
  481. transformers/models/flaubert/modeling_flaubert.py +138 -143
  482. transformers/models/flaubert/tokenization_flaubert.py +3 -5
  483. transformers/models/flava/configuration_flava.py +5 -6
  484. transformers/models/flava/image_processing_flava.py +66 -67
  485. transformers/models/flava/image_processing_flava_fast.py +42 -45
  486. transformers/models/flava/modeling_flava.py +111 -107
  487. transformers/models/flava/processing_flava.py +2 -12
  488. transformers/models/flex_olmo/__init__.py +0 -1
  489. transformers/models/flex_olmo/configuration_flex_olmo.py +23 -25
  490. transformers/models/flex_olmo/modeling_flex_olmo.py +44 -43
  491. transformers/models/flex_olmo/modular_flex_olmo.py +35 -37
  492. transformers/models/florence2/configuration_florence2.py +0 -1
  493. transformers/models/florence2/modeling_florence2.py +59 -43
  494. transformers/models/florence2/modular_florence2.py +65 -81
  495. transformers/models/florence2/processing_florence2.py +18 -47
  496. transformers/models/fnet/configuration_fnet.py +0 -1
  497. transformers/models/fnet/modeling_fnet.py +76 -80
  498. transformers/models/fnet/tokenization_fnet.py +0 -1
  499. transformers/models/focalnet/configuration_focalnet.py +0 -1
  500. transformers/models/focalnet/modeling_focalnet.py +39 -41
  501. transformers/models/fsmt/configuration_fsmt.py +0 -1
  502. transformers/models/fsmt/modeling_fsmt.py +47 -48
  503. transformers/models/fsmt/tokenization_fsmt.py +3 -5
  504. transformers/models/funnel/configuration_funnel.py +0 -1
  505. transformers/models/funnel/modeling_funnel.py +91 -93
  506. transformers/models/funnel/tokenization_funnel.py +2 -5
  507. transformers/models/fuyu/configuration_fuyu.py +23 -26
  508. transformers/models/fuyu/image_processing_fuyu.py +29 -31
  509. transformers/models/fuyu/image_processing_fuyu_fast.py +12 -13
  510. transformers/models/fuyu/modeling_fuyu.py +29 -30
  511. transformers/models/fuyu/processing_fuyu.py +23 -34
  512. transformers/models/gemma/configuration_gemma.py +20 -23
  513. transformers/models/gemma/modeling_gemma.py +42 -46
  514. transformers/models/gemma/modular_gemma.py +37 -40
  515. transformers/models/gemma/tokenization_gemma.py +3 -6
  516. transformers/models/gemma2/configuration_gemma2.py +25 -28
  517. transformers/models/gemma2/modeling_gemma2.py +35 -38
  518. transformers/models/gemma2/modular_gemma2.py +56 -58
  519. transformers/models/gemma3/configuration_gemma3.py +28 -29
  520. transformers/models/gemma3/image_processing_gemma3.py +29 -31
  521. transformers/models/gemma3/image_processing_gemma3_fast.py +9 -11
  522. transformers/models/gemma3/modeling_gemma3.py +112 -94
  523. transformers/models/gemma3/modular_gemma3.py +110 -91
  524. transformers/models/gemma3/processing_gemma3.py +5 -5
  525. transformers/models/gemma3n/configuration_gemma3n.py +12 -10
  526. transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
  527. transformers/models/gemma3n/modeling_gemma3n.py +127 -98
  528. transformers/models/gemma3n/modular_gemma3n.py +117 -84
  529. transformers/models/gemma3n/processing_gemma3n.py +12 -26
  530. transformers/models/git/configuration_git.py +0 -1
  531. transformers/models/git/modeling_git.py +250 -197
  532. transformers/models/git/processing_git.py +2 -14
  533. transformers/models/glm/configuration_glm.py +19 -21
  534. transformers/models/glm/modeling_glm.py +33 -36
  535. transformers/models/glm/modular_glm.py +4 -7
  536. transformers/models/glm4/configuration_glm4.py +19 -21
  537. transformers/models/glm4/modeling_glm4.py +36 -38
  538. transformers/models/glm4/modular_glm4.py +8 -10
  539. transformers/models/glm46v/configuration_glm46v.py +0 -1
  540. transformers/models/glm46v/image_processing_glm46v.py +35 -40
  541. transformers/models/glm46v/image_processing_glm46v_fast.py +7 -7
  542. transformers/models/glm46v/modeling_glm46v.py +54 -52
  543. transformers/models/glm46v/modular_glm46v.py +4 -3
  544. transformers/models/glm46v/processing_glm46v.py +7 -41
  545. transformers/models/glm46v/video_processing_glm46v.py +9 -11
  546. transformers/models/glm4_moe/configuration_glm4_moe.py +25 -28
  547. transformers/models/glm4_moe/modeling_glm4_moe.py +41 -40
  548. transformers/models/glm4_moe/modular_glm4_moe.py +27 -30
  549. transformers/models/glm4_moe_lite/__init__.py +28 -0
  550. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +235 -0
  551. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
  552. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +304 -0
  553. transformers/models/glm4v/configuration_glm4v.py +14 -17
  554. transformers/models/glm4v/image_processing_glm4v.py +34 -40
  555. transformers/models/glm4v/image_processing_glm4v_fast.py +6 -7
  556. transformers/models/glm4v/modeling_glm4v.py +148 -156
  557. transformers/models/glm4v/modular_glm4v.py +142 -185
  558. transformers/models/glm4v/processing_glm4v.py +7 -41
  559. transformers/models/glm4v/video_processing_glm4v.py +9 -11
  560. transformers/models/glm4v_moe/configuration_glm4v_moe.py +119 -122
  561. transformers/models/glm4v_moe/modeling_glm4v_moe.py +275 -319
  562. transformers/models/glm4v_moe/modular_glm4v_moe.py +66 -163
  563. transformers/models/glm_image/__init__.py +31 -0
  564. transformers/models/glm_image/configuration_glm_image.py +352 -0
  565. transformers/models/glm_image/image_processing_glm_image.py +503 -0
  566. transformers/models/glm_image/image_processing_glm_image_fast.py +296 -0
  567. transformers/models/glm_image/modeling_glm_image.py +1590 -0
  568. transformers/models/glm_image/modular_glm_image.py +1480 -0
  569. transformers/models/glm_image/processing_glm_image.py +217 -0
  570. transformers/models/glmasr/__init__.py +29 -0
  571. transformers/models/glmasr/configuration_glmasr.py +196 -0
  572. transformers/models/glmasr/modeling_glmasr.py +511 -0
  573. transformers/models/glmasr/modular_glmasr.py +431 -0
  574. transformers/models/glmasr/processing_glmasr.py +331 -0
  575. transformers/models/glpn/configuration_glpn.py +0 -1
  576. transformers/models/glpn/image_processing_glpn.py +11 -12
  577. transformers/models/glpn/image_processing_glpn_fast.py +8 -10
  578. transformers/models/glpn/modeling_glpn.py +10 -12
  579. transformers/models/got_ocr2/configuration_got_ocr2.py +5 -8
  580. transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
  581. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +6 -8
  582. transformers/models/got_ocr2/modeling_got_ocr2.py +48 -45
  583. transformers/models/got_ocr2/modular_got_ocr2.py +31 -34
  584. transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
  585. transformers/models/gpt2/configuration_gpt2.py +0 -1
  586. transformers/models/gpt2/modeling_gpt2.py +114 -113
  587. transformers/models/gpt2/tokenization_gpt2.py +6 -9
  588. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +0 -1
  589. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +76 -88
  590. transformers/models/gpt_neo/configuration_gpt_neo.py +0 -1
  591. transformers/models/gpt_neo/modeling_gpt_neo.py +77 -66
  592. transformers/models/gpt_neox/configuration_gpt_neox.py +19 -22
  593. transformers/models/gpt_neox/modeling_gpt_neox.py +71 -73
  594. transformers/models/gpt_neox/modular_gpt_neox.py +64 -66
  595. transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
  596. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +15 -18
  597. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +42 -45
  598. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
  599. transformers/models/gpt_oss/configuration_gpt_oss.py +38 -24
  600. transformers/models/gpt_oss/modeling_gpt_oss.py +40 -44
  601. transformers/models/gpt_oss/modular_gpt_oss.py +22 -26
  602. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  603. transformers/models/gptj/configuration_gptj.py +0 -1
  604. transformers/models/gptj/modeling_gptj.py +96 -86
  605. transformers/models/granite/configuration_granite.py +23 -26
  606. transformers/models/granite/modeling_granite.py +40 -42
  607. transformers/models/granite/modular_granite.py +29 -31
  608. transformers/models/granite_speech/configuration_granite_speech.py +0 -1
  609. transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
  610. transformers/models/granite_speech/modeling_granite_speech.py +36 -24
  611. transformers/models/granite_speech/processing_granite_speech.py +11 -4
  612. transformers/models/granitemoe/configuration_granitemoe.py +26 -29
  613. transformers/models/granitemoe/modeling_granitemoe.py +37 -40
  614. transformers/models/granitemoe/modular_granitemoe.py +22 -25
  615. transformers/models/granitemoehybrid/__init__.py +0 -1
  616. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +41 -40
  617. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +92 -86
  618. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +29 -21
  619. transformers/models/granitemoeshared/configuration_granitemoeshared.py +27 -30
  620. transformers/models/granitemoeshared/modeling_granitemoeshared.py +50 -55
  621. transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
  622. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -4
  623. transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
  624. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +17 -18
  625. transformers/models/grounding_dino/modeling_grounding_dino.py +95 -97
  626. transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
  627. transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
  628. transformers/models/groupvit/configuration_groupvit.py +0 -1
  629. transformers/models/groupvit/modeling_groupvit.py +75 -71
  630. transformers/models/helium/configuration_helium.py +20 -22
  631. transformers/models/helium/modeling_helium.py +34 -37
  632. transformers/models/helium/modular_helium.py +3 -7
  633. transformers/models/herbert/tokenization_herbert.py +4 -6
  634. transformers/models/hgnet_v2/configuration_hgnet_v2.py +0 -1
  635. transformers/models/hgnet_v2/modeling_hgnet_v2.py +16 -9
  636. transformers/models/hgnet_v2/modular_hgnet_v2.py +16 -9
  637. transformers/models/hiera/configuration_hiera.py +0 -1
  638. transformers/models/hiera/modeling_hiera.py +60 -62
  639. transformers/models/hubert/configuration_hubert.py +0 -1
  640. transformers/models/hubert/modeling_hubert.py +39 -37
  641. transformers/models/hubert/modular_hubert.py +12 -11
  642. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +21 -24
  643. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +31 -34
  644. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +4 -6
  645. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  646. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +25 -28
  647. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +44 -39
  648. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +9 -9
  649. transformers/models/ibert/configuration_ibert.py +0 -1
  650. transformers/models/ibert/modeling_ibert.py +76 -62
  651. transformers/models/ibert/quant_modules.py +0 -1
  652. transformers/models/idefics/configuration_idefics.py +0 -1
  653. transformers/models/idefics/image_processing_idefics.py +13 -15
  654. transformers/models/idefics/modeling_idefics.py +70 -61
  655. transformers/models/idefics/perceiver.py +1 -3
  656. transformers/models/idefics/processing_idefics.py +32 -48
  657. transformers/models/idefics/vision.py +22 -24
  658. transformers/models/idefics2/configuration_idefics2.py +0 -1
  659. transformers/models/idefics2/image_processing_idefics2.py +31 -32
  660. transformers/models/idefics2/image_processing_idefics2_fast.py +7 -8
  661. transformers/models/idefics2/modeling_idefics2.py +63 -59
  662. transformers/models/idefics2/processing_idefics2.py +10 -68
  663. transformers/models/idefics3/configuration_idefics3.py +0 -1
  664. transformers/models/idefics3/image_processing_idefics3.py +42 -43
  665. transformers/models/idefics3/image_processing_idefics3_fast.py +11 -12
  666. transformers/models/idefics3/modeling_idefics3.py +57 -55
  667. transformers/models/idefics3/processing_idefics3.py +15 -69
  668. transformers/models/ijepa/configuration_ijepa.py +0 -1
  669. transformers/models/ijepa/modeling_ijepa.py +10 -11
  670. transformers/models/ijepa/modular_ijepa.py +5 -7
  671. transformers/models/imagegpt/configuration_imagegpt.py +0 -1
  672. transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
  673. transformers/models/imagegpt/image_processing_imagegpt_fast.py +9 -14
  674. transformers/models/imagegpt/modeling_imagegpt.py +66 -60
  675. transformers/models/informer/configuration_informer.py +6 -9
  676. transformers/models/informer/modeling_informer.py +84 -86
  677. transformers/models/informer/modular_informer.py +13 -16
  678. transformers/models/instructblip/configuration_instructblip.py +0 -1
  679. transformers/models/instructblip/modeling_instructblip.py +45 -44
  680. transformers/models/instructblip/processing_instructblip.py +10 -36
  681. transformers/models/instructblipvideo/configuration_instructblipvideo.py +0 -1
  682. transformers/models/instructblipvideo/modeling_instructblipvideo.py +107 -105
  683. transformers/models/instructblipvideo/modular_instructblipvideo.py +34 -36
  684. transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
  685. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +4 -6
  686. transformers/models/internvl/configuration_internvl.py +0 -1
  687. transformers/models/internvl/modeling_internvl.py +52 -51
  688. transformers/models/internvl/modular_internvl.py +24 -30
  689. transformers/models/internvl/processing_internvl.py +12 -45
  690. transformers/models/internvl/video_processing_internvl.py +8 -10
  691. transformers/models/jais2/__init__.py +27 -0
  692. transformers/models/jais2/configuration_jais2.py +150 -0
  693. transformers/models/jais2/modeling_jais2.py +484 -0
  694. transformers/models/jais2/modular_jais2.py +194 -0
  695. transformers/models/jamba/configuration_jamba.py +0 -1
  696. transformers/models/jamba/modeling_jamba.py +67 -65
  697. transformers/models/jamba/modular_jamba.py +54 -55
  698. transformers/models/janus/configuration_janus.py +0 -1
  699. transformers/models/janus/image_processing_janus.py +35 -37
  700. transformers/models/janus/image_processing_janus_fast.py +12 -14
  701. transformers/models/janus/modeling_janus.py +56 -50
  702. transformers/models/janus/modular_janus.py +76 -70
  703. transformers/models/janus/processing_janus.py +17 -43
  704. transformers/models/jetmoe/configuration_jetmoe.py +20 -23
  705. transformers/models/jetmoe/modeling_jetmoe.py +41 -44
  706. transformers/models/jetmoe/modular_jetmoe.py +31 -33
  707. transformers/models/kosmos2/configuration_kosmos2.py +0 -1
  708. transformers/models/kosmos2/modeling_kosmos2.py +159 -148
  709. transformers/models/kosmos2/processing_kosmos2.py +40 -55
  710. transformers/models/kosmos2_5/__init__.py +0 -1
  711. transformers/models/kosmos2_5/configuration_kosmos2_5.py +0 -1
  712. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
  713. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +4 -13
  714. transformers/models/kosmos2_5/modeling_kosmos2_5.py +118 -110
  715. transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
  716. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +23 -25
  717. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
  718. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +67 -68
  719. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +28 -22
  720. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
  721. transformers/models/lasr/configuration_lasr.py +5 -3
  722. transformers/models/lasr/feature_extraction_lasr.py +10 -12
  723. transformers/models/lasr/modeling_lasr.py +21 -23
  724. transformers/models/lasr/modular_lasr.py +16 -11
  725. transformers/models/lasr/processing_lasr.py +12 -8
  726. transformers/models/lasr/tokenization_lasr.py +2 -4
  727. transformers/models/layoutlm/configuration_layoutlm.py +0 -1
  728. transformers/models/layoutlm/modeling_layoutlm.py +72 -72
  729. transformers/models/layoutlmv2/configuration_layoutlmv2.py +0 -1
  730. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
  731. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +5 -7
  732. transformers/models/layoutlmv2/modeling_layoutlmv2.py +60 -50
  733. transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
  734. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +64 -74
  735. transformers/models/layoutlmv3/configuration_layoutlmv3.py +0 -1
  736. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
  737. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +7 -9
  738. transformers/models/layoutlmv3/modeling_layoutlmv3.py +78 -56
  739. transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
  740. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
  741. transformers/models/layoutxlm/configuration_layoutxlm.py +0 -1
  742. transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
  743. transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
  744. transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
  745. transformers/models/led/configuration_led.py +1 -4
  746. transformers/models/led/modeling_led.py +119 -267
  747. transformers/models/levit/configuration_levit.py +0 -1
  748. transformers/models/levit/image_processing_levit.py +19 -21
  749. transformers/models/levit/image_processing_levit_fast.py +0 -1
  750. transformers/models/levit/modeling_levit.py +35 -19
  751. transformers/models/lfm2/configuration_lfm2.py +22 -23
  752. transformers/models/lfm2/modeling_lfm2.py +43 -45
  753. transformers/models/lfm2/modular_lfm2.py +29 -29
  754. transformers/models/lfm2_moe/__init__.py +0 -1
  755. transformers/models/lfm2_moe/configuration_lfm2_moe.py +1 -2
  756. transformers/models/lfm2_moe/modeling_lfm2_moe.py +58 -49
  757. transformers/models/lfm2_moe/modular_lfm2_moe.py +13 -37
  758. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -1
  759. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +34 -5
  760. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -38
  761. transformers/models/lfm2_vl/modular_lfm2_vl.py +28 -29
  762. transformers/models/lfm2_vl/processing_lfm2_vl.py +96 -76
  763. transformers/models/lightglue/image_processing_lightglue.py +16 -15
  764. transformers/models/lightglue/image_processing_lightglue_fast.py +5 -6
  765. transformers/models/lightglue/modeling_lightglue.py +28 -30
  766. transformers/models/lightglue/modular_lightglue.py +28 -28
  767. transformers/models/lighton_ocr/__init__.py +28 -0
  768. transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
  769. transformers/models/lighton_ocr/modeling_lighton_ocr.py +460 -0
  770. transformers/models/lighton_ocr/modular_lighton_ocr.py +403 -0
  771. transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
  772. transformers/models/lilt/configuration_lilt.py +0 -1
  773. transformers/models/lilt/modeling_lilt.py +72 -70
  774. transformers/models/llama/configuration_llama.py +21 -24
  775. transformers/models/llama/modeling_llama.py +32 -35
  776. transformers/models/llama/tokenization_llama.py +2 -4
  777. transformers/models/llama4/configuration_llama4.py +20 -22
  778. transformers/models/llama4/image_processing_llama4_fast.py +9 -11
  779. transformers/models/llama4/modeling_llama4.py +78 -75
  780. transformers/models/llama4/processing_llama4.py +33 -57
  781. transformers/models/llava/configuration_llava.py +0 -1
  782. transformers/models/llava/image_processing_llava.py +25 -28
  783. transformers/models/llava/image_processing_llava_fast.py +6 -8
  784. transformers/models/llava/modeling_llava.py +47 -44
  785. transformers/models/llava/processing_llava.py +18 -51
  786. transformers/models/llava_next/configuration_llava_next.py +0 -1
  787. transformers/models/llava_next/image_processing_llava_next.py +43 -45
  788. transformers/models/llava_next/image_processing_llava_next_fast.py +5 -7
  789. transformers/models/llava_next/modeling_llava_next.py +49 -47
  790. transformers/models/llava_next/processing_llava_next.py +18 -47
  791. transformers/models/llava_next_video/configuration_llava_next_video.py +0 -1
  792. transformers/models/llava_next_video/modeling_llava_next_video.py +60 -58
  793. transformers/models/llava_next_video/modular_llava_next_video.py +51 -49
  794. transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
  795. transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
  796. transformers/models/llava_onevision/configuration_llava_onevision.py +0 -1
  797. transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
  798. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +6 -8
  799. transformers/models/llava_onevision/modeling_llava_onevision.py +67 -65
  800. transformers/models/llava_onevision/modular_llava_onevision.py +58 -56
  801. transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
  802. transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
  803. transformers/models/longcat_flash/__init__.py +0 -1
  804. transformers/models/longcat_flash/configuration_longcat_flash.py +32 -35
  805. transformers/models/longcat_flash/modeling_longcat_flash.py +32 -32
  806. transformers/models/longcat_flash/modular_longcat_flash.py +18 -19
  807. transformers/models/longformer/configuration_longformer.py +1 -4
  808. transformers/models/longformer/modeling_longformer.py +99 -101
  809. transformers/models/longt5/configuration_longt5.py +0 -1
  810. transformers/models/longt5/modeling_longt5.py +43 -48
  811. transformers/models/luke/configuration_luke.py +0 -1
  812. transformers/models/luke/modeling_luke.py +179 -181
  813. transformers/models/luke/tokenization_luke.py +99 -105
  814. transformers/models/lw_detr/__init__.py +27 -0
  815. transformers/models/lw_detr/configuration_lw_detr.py +374 -0
  816. transformers/models/lw_detr/modeling_lw_detr.py +1698 -0
  817. transformers/models/lw_detr/modular_lw_detr.py +1611 -0
  818. transformers/models/lxmert/configuration_lxmert.py +0 -1
  819. transformers/models/lxmert/modeling_lxmert.py +63 -74
  820. transformers/models/m2m_100/configuration_m2m_100.py +0 -1
  821. transformers/models/m2m_100/modeling_m2m_100.py +79 -71
  822. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  823. transformers/models/mamba/configuration_mamba.py +0 -1
  824. transformers/models/mamba/modeling_mamba.py +44 -44
  825. transformers/models/mamba2/configuration_mamba2.py +0 -1
  826. transformers/models/mamba2/modeling_mamba2.py +67 -68
  827. transformers/models/marian/configuration_marian.py +1 -2
  828. transformers/models/marian/modeling_marian.py +87 -86
  829. transformers/models/marian/tokenization_marian.py +6 -6
  830. transformers/models/markuplm/configuration_markuplm.py +0 -1
  831. transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
  832. transformers/models/markuplm/modeling_markuplm.py +65 -70
  833. transformers/models/markuplm/processing_markuplm.py +31 -38
  834. transformers/models/markuplm/tokenization_markuplm.py +67 -77
  835. transformers/models/mask2former/configuration_mask2former.py +5 -8
  836. transformers/models/mask2former/image_processing_mask2former.py +84 -85
  837. transformers/models/mask2former/image_processing_mask2former_fast.py +30 -33
  838. transformers/models/mask2former/modeling_mask2former.py +99 -92
  839. transformers/models/mask2former/modular_mask2former.py +6 -8
  840. transformers/models/maskformer/configuration_maskformer.py +6 -9
  841. transformers/models/maskformer/configuration_maskformer_swin.py +0 -1
  842. transformers/models/maskformer/image_processing_maskformer.py +84 -85
  843. transformers/models/maskformer/image_processing_maskformer_fast.py +29 -33
  844. transformers/models/maskformer/modeling_maskformer.py +65 -59
  845. transformers/models/maskformer/modeling_maskformer_swin.py +34 -32
  846. transformers/models/mbart/configuration_mbart.py +1 -1
  847. transformers/models/mbart/modeling_mbart.py +118 -113
  848. transformers/models/mbart/tokenization_mbart.py +2 -4
  849. transformers/models/mbart50/tokenization_mbart50.py +3 -5
  850. transformers/models/megatron_bert/configuration_megatron_bert.py +0 -1
  851. transformers/models/megatron_bert/modeling_megatron_bert.py +141 -150
  852. transformers/models/metaclip_2/modeling_metaclip_2.py +48 -46
  853. transformers/models/metaclip_2/modular_metaclip_2.py +21 -21
  854. transformers/models/mgp_str/configuration_mgp_str.py +0 -1
  855. transformers/models/mgp_str/modeling_mgp_str.py +14 -16
  856. transformers/models/mgp_str/processing_mgp_str.py +3 -20
  857. transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
  858. transformers/models/mimi/configuration_mimi.py +38 -40
  859. transformers/models/mimi/modeling_mimi.py +100 -82
  860. transformers/models/minimax/__init__.py +0 -1
  861. transformers/models/minimax/configuration_minimax.py +32 -36
  862. transformers/models/minimax/modeling_minimax.py +57 -47
  863. transformers/models/minimax/modular_minimax.py +62 -54
  864. transformers/models/minimax_m2/__init__.py +28 -0
  865. transformers/models/minimax_m2/configuration_minimax_m2.py +211 -0
  866. transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
  867. transformers/models/minimax_m2/modular_minimax_m2.py +369 -0
  868. transformers/models/ministral/configuration_ministral.py +20 -22
  869. transformers/models/ministral/modeling_ministral.py +32 -34
  870. transformers/models/ministral/modular_ministral.py +27 -29
  871. transformers/models/ministral3/configuration_ministral3.py +19 -22
  872. transformers/models/ministral3/modeling_ministral3.py +32 -34
  873. transformers/models/ministral3/modular_ministral3.py +4 -5
  874. transformers/models/mistral/configuration_mistral.py +19 -22
  875. transformers/models/mistral/modeling_mistral.py +32 -34
  876. transformers/models/mistral/modular_mistral.py +11 -12
  877. transformers/models/mistral3/configuration_mistral3.py +0 -1
  878. transformers/models/mistral3/modeling_mistral3.py +53 -46
  879. transformers/models/mistral3/modular_mistral3.py +38 -36
  880. transformers/models/mixtral/configuration_mixtral.py +24 -27
  881. transformers/models/mixtral/modeling_mixtral.py +47 -42
  882. transformers/models/mixtral/modular_mixtral.py +32 -31
  883. transformers/models/mlcd/configuration_mlcd.py +0 -1
  884. transformers/models/mlcd/modeling_mlcd.py +16 -12
  885. transformers/models/mlcd/modular_mlcd.py +13 -11
  886. transformers/models/mllama/configuration_mllama.py +5 -8
  887. transformers/models/mllama/image_processing_mllama.py +23 -25
  888. transformers/models/mllama/image_processing_mllama_fast.py +5 -6
  889. transformers/models/mllama/modeling_mllama.py +94 -86
  890. transformers/models/mllama/processing_mllama.py +6 -55
  891. transformers/models/mluke/tokenization_mluke.py +97 -103
  892. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -3
  893. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +95 -97
  894. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -3
  895. transformers/models/mobilebert/configuration_mobilebert.py +0 -1
  896. transformers/models/mobilebert/modeling_mobilebert.py +77 -85
  897. transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
  898. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
  899. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
  900. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
  901. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
  902. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
  903. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
  904. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +10 -12
  905. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +17 -20
  906. transformers/models/mobilevit/configuration_mobilevit.py +0 -1
  907. transformers/models/mobilevit/image_processing_mobilevit.py +46 -49
  908. transformers/models/mobilevit/image_processing_mobilevit_fast.py +9 -11
  909. transformers/models/mobilevit/modeling_mobilevit.py +21 -19
  910. transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
  911. transformers/models/mobilevitv2/modeling_mobilevitv2.py +21 -20
  912. transformers/models/modernbert/configuration_modernbert.py +34 -34
  913. transformers/models/modernbert/modeling_modernbert.py +135 -126
  914. transformers/models/modernbert/modular_modernbert.py +167 -156
  915. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +30 -32
  916. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +54 -48
  917. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +78 -71
  918. transformers/models/moonshine/configuration_moonshine.py +22 -24
  919. transformers/models/moonshine/modeling_moonshine.py +64 -66
  920. transformers/models/moonshine/modular_moonshine.py +72 -73
  921. transformers/models/moshi/configuration_moshi.py +18 -21
  922. transformers/models/moshi/modeling_moshi.py +150 -183
  923. transformers/models/mpnet/configuration_mpnet.py +0 -1
  924. transformers/models/mpnet/modeling_mpnet.py +57 -57
  925. transformers/models/mpnet/tokenization_mpnet.py +1 -4
  926. transformers/models/mpt/configuration_mpt.py +1 -9
  927. transformers/models/mpt/modeling_mpt.py +58 -60
  928. transformers/models/mra/configuration_mra.py +0 -1
  929. transformers/models/mra/modeling_mra.py +58 -57
  930. transformers/models/mt5/configuration_mt5.py +2 -4
  931. transformers/models/mt5/modeling_mt5.py +75 -87
  932. transformers/models/musicgen/configuration_musicgen.py +0 -1
  933. transformers/models/musicgen/modeling_musicgen.py +113 -120
  934. transformers/models/musicgen/processing_musicgen.py +3 -21
  935. transformers/models/musicgen_melody/configuration_musicgen_melody.py +0 -1
  936. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
  937. transformers/models/musicgen_melody/modeling_musicgen_melody.py +110 -109
  938. transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
  939. transformers/models/mvp/configuration_mvp.py +0 -1
  940. transformers/models/mvp/modeling_mvp.py +122 -119
  941. transformers/models/myt5/tokenization_myt5.py +8 -10
  942. transformers/models/nanochat/configuration_nanochat.py +0 -1
  943. transformers/models/nanochat/modeling_nanochat.py +33 -36
  944. transformers/models/nanochat/modular_nanochat.py +12 -14
  945. transformers/models/nemotron/configuration_nemotron.py +20 -23
  946. transformers/models/nemotron/modeling_nemotron.py +51 -54
  947. transformers/models/nllb/tokenization_nllb.py +7 -9
  948. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -1
  949. transformers/models/nllb_moe/modeling_nllb_moe.py +77 -69
  950. transformers/models/nougat/image_processing_nougat.py +29 -32
  951. transformers/models/nougat/image_processing_nougat_fast.py +4 -6
  952. transformers/models/nougat/processing_nougat.py +37 -39
  953. transformers/models/nougat/tokenization_nougat.py +16 -23
  954. transformers/models/nystromformer/configuration_nystromformer.py +0 -1
  955. transformers/models/nystromformer/modeling_nystromformer.py +68 -63
  956. transformers/models/olmo/configuration_olmo.py +18 -21
  957. transformers/models/olmo/modeling_olmo.py +32 -35
  958. transformers/models/olmo/modular_olmo.py +5 -9
  959. transformers/models/olmo2/configuration_olmo2.py +18 -21
  960. transformers/models/olmo2/modeling_olmo2.py +33 -36
  961. transformers/models/olmo2/modular_olmo2.py +29 -31
  962. transformers/models/olmo3/__init__.py +0 -1
  963. transformers/models/olmo3/configuration_olmo3.py +20 -23
  964. transformers/models/olmo3/modeling_olmo3.py +32 -35
  965. transformers/models/olmo3/modular_olmo3.py +31 -33
  966. transformers/models/olmoe/configuration_olmoe.py +24 -26
  967. transformers/models/olmoe/modeling_olmoe.py +49 -43
  968. transformers/models/olmoe/modular_olmoe.py +16 -15
  969. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -3
  970. transformers/models/omdet_turbo/modeling_omdet_turbo.py +42 -40
  971. transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
  972. transformers/models/oneformer/configuration_oneformer.py +5 -8
  973. transformers/models/oneformer/image_processing_oneformer.py +83 -84
  974. transformers/models/oneformer/image_processing_oneformer_fast.py +33 -34
  975. transformers/models/oneformer/modeling_oneformer.py +130 -162
  976. transformers/models/oneformer/processing_oneformer.py +28 -43
  977. transformers/models/openai/configuration_openai.py +0 -1
  978. transformers/models/openai/modeling_openai.py +62 -51
  979. transformers/models/openai/tokenization_openai.py +2 -5
  980. transformers/models/opt/configuration_opt.py +0 -1
  981. transformers/models/opt/modeling_opt.py +74 -75
  982. transformers/models/ovis2/__init__.py +0 -1
  983. transformers/models/ovis2/configuration_ovis2.py +0 -1
  984. transformers/models/ovis2/image_processing_ovis2.py +22 -24
  985. transformers/models/ovis2/image_processing_ovis2_fast.py +6 -8
  986. transformers/models/ovis2/modeling_ovis2.py +58 -48
  987. transformers/models/ovis2/modular_ovis2.py +38 -32
  988. transformers/models/ovis2/processing_ovis2.py +12 -40
  989. transformers/models/owlv2/configuration_owlv2.py +0 -1
  990. transformers/models/owlv2/image_processing_owlv2.py +20 -21
  991. transformers/models/owlv2/image_processing_owlv2_fast.py +7 -10
  992. transformers/models/owlv2/modeling_owlv2.py +89 -90
  993. transformers/models/owlv2/modular_owlv2.py +6 -9
  994. transformers/models/owlv2/processing_owlv2.py +20 -49
  995. transformers/models/owlvit/configuration_owlvit.py +0 -1
  996. transformers/models/owlvit/image_processing_owlvit.py +21 -22
  997. transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
  998. transformers/models/owlvit/modeling_owlvit.py +88 -89
  999. transformers/models/owlvit/processing_owlvit.py +20 -48
  1000. transformers/models/paddleocr_vl/__init__.py +0 -1
  1001. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +19 -19
  1002. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +37 -37
  1003. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
  1004. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +104 -90
  1005. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +90 -80
  1006. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
  1007. transformers/models/paligemma/configuration_paligemma.py +0 -1
  1008. transformers/models/paligemma/modeling_paligemma.py +73 -67
  1009. transformers/models/paligemma/processing_paligemma.py +13 -66
  1010. transformers/models/parakeet/configuration_parakeet.py +1 -4
  1011. transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
  1012. transformers/models/parakeet/modeling_parakeet.py +23 -22
  1013. transformers/models/parakeet/modular_parakeet.py +21 -18
  1014. transformers/models/parakeet/processing_parakeet.py +12 -5
  1015. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +5 -7
  1016. transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
  1017. transformers/models/patchtsmixer/modeling_patchtsmixer.py +64 -62
  1018. transformers/models/patchtst/configuration_patchtst.py +6 -9
  1019. transformers/models/patchtst/modeling_patchtst.py +77 -78
  1020. transformers/models/pe_audio/__init__.py +29 -0
  1021. transformers/models/pe_audio/configuration_pe_audio.py +204 -0
  1022. transformers/models/pe_audio/feature_extraction_pe_audio.py +160 -0
  1023. transformers/models/pe_audio/modeling_pe_audio.py +819 -0
  1024. transformers/models/pe_audio/modular_pe_audio.py +298 -0
  1025. transformers/models/pe_audio/processing_pe_audio.py +23 -0
  1026. transformers/models/pe_audio_video/__init__.py +28 -0
  1027. transformers/models/pe_audio_video/configuration_pe_audio_video.py +223 -0
  1028. transformers/models/pe_audio_video/modeling_pe_audio_video.py +971 -0
  1029. transformers/models/pe_audio_video/modular_pe_audio_video.py +763 -0
  1030. transformers/models/pe_audio_video/processing_pe_audio_video.py +24 -0
  1031. transformers/models/pe_video/__init__.py +29 -0
  1032. transformers/models/pe_video/configuration_pe_video.py +209 -0
  1033. transformers/models/pe_video/modeling_pe_video.py +635 -0
  1034. transformers/models/pe_video/modular_pe_video.py +218 -0
  1035. transformers/models/pe_video/processing_pe_video.py +10 -0
  1036. transformers/models/pe_video/video_processing_pe_video.py +64 -0
  1037. transformers/models/pegasus/configuration_pegasus.py +1 -1
  1038. transformers/models/pegasus/modeling_pegasus.py +66 -65
  1039. transformers/models/pegasus/tokenization_pegasus.py +1 -4
  1040. transformers/models/pegasus_x/configuration_pegasus_x.py +0 -1
  1041. transformers/models/pegasus_x/modeling_pegasus_x.py +51 -52
  1042. transformers/models/perceiver/configuration_perceiver.py +0 -1
  1043. transformers/models/perceiver/image_processing_perceiver.py +22 -25
  1044. transformers/models/perceiver/image_processing_perceiver_fast.py +5 -7
  1045. transformers/models/perceiver/modeling_perceiver.py +140 -137
  1046. transformers/models/perceiver/tokenization_perceiver.py +3 -6
  1047. transformers/models/perception_lm/configuration_perception_lm.py +0 -1
  1048. transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -10
  1049. transformers/models/perception_lm/modeling_perception_lm.py +45 -43
  1050. transformers/models/perception_lm/modular_perception_lm.py +38 -36
  1051. transformers/models/perception_lm/processing_perception_lm.py +13 -47
  1052. transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
  1053. transformers/models/persimmon/configuration_persimmon.py +18 -21
  1054. transformers/models/persimmon/modeling_persimmon.py +40 -43
  1055. transformers/models/phi/configuration_phi.py +19 -22
  1056. transformers/models/phi/modeling_phi.py +36 -38
  1057. transformers/models/phi/modular_phi.py +23 -23
  1058. transformers/models/phi3/configuration_phi3.py +23 -26
  1059. transformers/models/phi3/modeling_phi3.py +34 -37
  1060. transformers/models/phi3/modular_phi3.py +13 -17
  1061. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +25 -26
  1062. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
  1063. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +7 -7
  1064. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +58 -57
  1065. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +62 -60
  1066. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -44
  1067. transformers/models/phimoe/configuration_phimoe.py +26 -29
  1068. transformers/models/phimoe/modeling_phimoe.py +47 -42
  1069. transformers/models/phimoe/modular_phimoe.py +1 -2
  1070. transformers/models/phobert/tokenization_phobert.py +4 -6
  1071. transformers/models/pix2struct/configuration_pix2struct.py +0 -1
  1072. transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
  1073. transformers/models/pix2struct/image_processing_pix2struct_fast.py +7 -10
  1074. transformers/models/pix2struct/modeling_pix2struct.py +42 -45
  1075. transformers/models/pix2struct/processing_pix2struct.py +5 -30
  1076. transformers/models/pixio/__init__.py +29 -0
  1077. transformers/models/pixio/configuration_pixio.py +150 -0
  1078. transformers/models/pixio/modeling_pixio.py +505 -0
  1079. transformers/models/pixio/modular_pixio.py +401 -0
  1080. transformers/models/pixtral/configuration_pixtral.py +11 -14
  1081. transformers/models/pixtral/image_processing_pixtral.py +26 -28
  1082. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -6
  1083. transformers/models/pixtral/modeling_pixtral.py +23 -26
  1084. transformers/models/pixtral/processing_pixtral.py +21 -53
  1085. transformers/models/plbart/configuration_plbart.py +1 -1
  1086. transformers/models/plbart/modeling_plbart.py +107 -102
  1087. transformers/models/plbart/modular_plbart.py +36 -32
  1088. transformers/models/plbart/tokenization_plbart.py +4 -5
  1089. transformers/models/poolformer/configuration_poolformer.py +0 -1
  1090. transformers/models/poolformer/image_processing_poolformer.py +21 -24
  1091. transformers/models/poolformer/image_processing_poolformer_fast.py +6 -8
  1092. transformers/models/poolformer/modeling_poolformer.py +21 -13
  1093. transformers/models/pop2piano/configuration_pop2piano.py +0 -2
  1094. transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
  1095. transformers/models/pop2piano/modeling_pop2piano.py +22 -23
  1096. transformers/models/pop2piano/processing_pop2piano.py +25 -33
  1097. transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
  1098. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +3 -3
  1099. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1100. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +14 -15
  1101. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +9 -10
  1102. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +9 -10
  1103. transformers/models/prophetnet/configuration_prophetnet.py +26 -28
  1104. transformers/models/prophetnet/modeling_prophetnet.py +111 -131
  1105. transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
  1106. transformers/models/pvt/configuration_pvt.py +0 -1
  1107. transformers/models/pvt/image_processing_pvt.py +17 -20
  1108. transformers/models/pvt/image_processing_pvt_fast.py +0 -1
  1109. transformers/models/pvt/modeling_pvt.py +19 -21
  1110. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  1111. transformers/models/pvt_v2/modeling_pvt_v2.py +21 -23
  1112. transformers/models/qwen2/configuration_qwen2.py +18 -21
  1113. transformers/models/qwen2/modeling_qwen2.py +32 -34
  1114. transformers/models/qwen2/modular_qwen2.py +11 -12
  1115. transformers/models/qwen2/tokenization_qwen2.py +2 -5
  1116. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +20 -23
  1117. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +239 -192
  1118. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +174 -127
  1119. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
  1120. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +22 -25
  1121. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +112 -101
  1122. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +72 -107
  1123. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
  1124. transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
  1125. transformers/models/qwen2_audio/modeling_qwen2_audio.py +29 -31
  1126. transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
  1127. transformers/models/qwen2_moe/configuration_qwen2_moe.py +28 -31
  1128. transformers/models/qwen2_moe/modeling_qwen2_moe.py +48 -43
  1129. transformers/models/qwen2_moe/modular_qwen2_moe.py +7 -10
  1130. transformers/models/qwen2_vl/configuration_qwen2_vl.py +22 -24
  1131. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +41 -42
  1132. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +8 -9
  1133. transformers/models/qwen2_vl/modeling_qwen2_vl.py +108 -96
  1134. transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
  1135. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +35 -13
  1136. transformers/models/qwen3/configuration_qwen3.py +20 -23
  1137. transformers/models/qwen3/modeling_qwen3.py +32 -35
  1138. transformers/models/qwen3/modular_qwen3.py +4 -6
  1139. transformers/models/qwen3_moe/configuration_qwen3_moe.py +25 -28
  1140. transformers/models/qwen3_moe/modeling_qwen3_moe.py +48 -43
  1141. transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
  1142. transformers/models/qwen3_next/configuration_qwen3_next.py +31 -34
  1143. transformers/models/qwen3_next/modeling_qwen3_next.py +43 -48
  1144. transformers/models/qwen3_next/modular_qwen3_next.py +33 -34
  1145. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +89 -88
  1146. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +199 -156
  1147. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +170 -152
  1148. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
  1149. transformers/models/qwen3_vl/configuration_qwen3_vl.py +21 -24
  1150. transformers/models/qwen3_vl/modeling_qwen3_vl.py +91 -81
  1151. transformers/models/qwen3_vl/modular_qwen3_vl.py +86 -112
  1152. transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
  1153. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
  1154. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +21 -25
  1155. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +174 -195
  1156. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +65 -117
  1157. transformers/models/rag/configuration_rag.py +0 -9
  1158. transformers/models/rag/modeling_rag.py +123 -127
  1159. transformers/models/rag/retrieval_rag.py +2 -4
  1160. transformers/models/rag/tokenization_rag.py +0 -50
  1161. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +21 -24
  1162. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +34 -36
  1163. transformers/models/reformer/configuration_reformer.py +0 -1
  1164. transformers/models/reformer/modeling_reformer.py +76 -69
  1165. transformers/models/reformer/tokenization_reformer.py +3 -6
  1166. transformers/models/regnet/configuration_regnet.py +0 -1
  1167. transformers/models/regnet/modeling_regnet.py +11 -9
  1168. transformers/models/rembert/configuration_rembert.py +0 -1
  1169. transformers/models/rembert/modeling_rembert.py +115 -111
  1170. transformers/models/rembert/tokenization_rembert.py +1 -4
  1171. transformers/models/resnet/configuration_resnet.py +0 -1
  1172. transformers/models/resnet/modeling_resnet.py +16 -13
  1173. transformers/models/roberta/configuration_roberta.py +0 -1
  1174. transformers/models/roberta/modeling_roberta.py +94 -93
  1175. transformers/models/roberta/modular_roberta.py +58 -58
  1176. transformers/models/roberta/tokenization_roberta.py +2 -5
  1177. transformers/models/roberta/tokenization_roberta_old.py +2 -4
  1178. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +0 -1
  1179. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +94 -93
  1180. transformers/models/roc_bert/configuration_roc_bert.py +0 -1
  1181. transformers/models/roc_bert/modeling_roc_bert.py +122 -121
  1182. transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
  1183. transformers/models/roformer/configuration_roformer.py +0 -1
  1184. transformers/models/roformer/modeling_roformer.py +79 -81
  1185. transformers/models/roformer/tokenization_roformer.py +3 -6
  1186. transformers/models/roformer/tokenization_utils.py +0 -1
  1187. transformers/models/rt_detr/configuration_rt_detr.py +1 -2
  1188. transformers/models/rt_detr/configuration_rt_detr_resnet.py +0 -1
  1189. transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
  1190. transformers/models/rt_detr/image_processing_rt_detr_fast.py +15 -15
  1191. transformers/models/rt_detr/modeling_rt_detr.py +84 -82
  1192. transformers/models/rt_detr/modeling_rt_detr_resnet.py +10 -7
  1193. transformers/models/rt_detr/modular_rt_detr.py +14 -14
  1194. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -4
  1195. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +86 -81
  1196. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +10 -7
  1197. transformers/models/rwkv/configuration_rwkv.py +0 -1
  1198. transformers/models/rwkv/modeling_rwkv.py +30 -32
  1199. transformers/models/sam/configuration_sam.py +1 -1
  1200. transformers/models/sam/image_processing_sam.py +59 -60
  1201. transformers/models/sam/image_processing_sam_fast.py +21 -23
  1202. transformers/models/sam/modeling_sam.py +37 -36
  1203. transformers/models/sam/processing_sam.py +39 -27
  1204. transformers/models/sam2/configuration_sam2.py +1 -2
  1205. transformers/models/sam2/image_processing_sam2_fast.py +14 -15
  1206. transformers/models/sam2/modeling_sam2.py +50 -48
  1207. transformers/models/sam2/modular_sam2.py +48 -45
  1208. transformers/models/sam2/processing_sam2.py +31 -47
  1209. transformers/models/sam2_video/configuration_sam2_video.py +0 -1
  1210. transformers/models/sam2_video/modeling_sam2_video.py +119 -112
  1211. transformers/models/sam2_video/modular_sam2_video.py +91 -97
  1212. transformers/models/sam2_video/processing_sam2_video.py +49 -66
  1213. transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
  1214. transformers/models/sam3/configuration_sam3.py +21 -2
  1215. transformers/models/sam3/image_processing_sam3_fast.py +17 -20
  1216. transformers/models/sam3/modeling_sam3.py +77 -56
  1217. transformers/models/sam3/modular_sam3.py +3 -8
  1218. transformers/models/sam3/processing_sam3.py +29 -48
  1219. transformers/models/sam3_tracker/__init__.py +0 -1
  1220. transformers/models/sam3_tracker/configuration_sam3_tracker.py +0 -1
  1221. transformers/models/sam3_tracker/modeling_sam3_tracker.py +36 -36
  1222. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -1
  1223. transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -47
  1224. transformers/models/sam3_tracker_video/__init__.py +0 -1
  1225. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -1
  1226. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +96 -85
  1227. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +27 -6
  1228. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
  1229. transformers/models/sam3_video/configuration_sam3_video.py +14 -1
  1230. transformers/models/sam3_video/modeling_sam3_video.py +32 -34
  1231. transformers/models/sam3_video/processing_sam3_video.py +26 -46
  1232. transformers/models/sam_hq/__init__.py +1 -1
  1233. transformers/models/sam_hq/configuration_sam_hq.py +1 -1
  1234. transformers/models/sam_hq/modeling_sam_hq.py +65 -64
  1235. transformers/models/sam_hq/modular_sam_hq.py +17 -19
  1236. transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +39 -28
  1237. transformers/models/seamless_m4t/configuration_seamless_m4t.py +0 -1
  1238. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
  1239. transformers/models/seamless_m4t/modeling_seamless_m4t.py +207 -193
  1240. transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
  1241. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
  1242. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +0 -1
  1243. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +199 -195
  1244. transformers/models/seed_oss/configuration_seed_oss.py +23 -25
  1245. transformers/models/seed_oss/modeling_seed_oss.py +31 -33
  1246. transformers/models/seed_oss/modular_seed_oss.py +3 -4
  1247. transformers/models/segformer/configuration_segformer.py +0 -10
  1248. transformers/models/segformer/image_processing_segformer.py +39 -42
  1249. transformers/models/segformer/image_processing_segformer_fast.py +7 -9
  1250. transformers/models/segformer/modeling_segformer.py +26 -28
  1251. transformers/models/segformer/modular_segformer.py +5 -7
  1252. transformers/models/seggpt/configuration_seggpt.py +0 -1
  1253. transformers/models/seggpt/image_processing_seggpt.py +38 -41
  1254. transformers/models/seggpt/modeling_seggpt.py +28 -30
  1255. transformers/models/sew/configuration_sew.py +0 -1
  1256. transformers/models/sew/modeling_sew.py +33 -35
  1257. transformers/models/sew/modular_sew.py +10 -12
  1258. transformers/models/sew_d/configuration_sew_d.py +0 -1
  1259. transformers/models/sew_d/modeling_sew_d.py +28 -30
  1260. transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
  1261. transformers/models/shieldgemma2/modeling_shieldgemma2.py +16 -17
  1262. transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
  1263. transformers/models/siglip/configuration_siglip.py +0 -1
  1264. transformers/models/siglip/image_processing_siglip.py +17 -20
  1265. transformers/models/siglip/image_processing_siglip_fast.py +0 -1
  1266. transformers/models/siglip/modeling_siglip.py +62 -41
  1267. transformers/models/siglip/processing_siglip.py +2 -14
  1268. transformers/models/siglip/tokenization_siglip.py +6 -7
  1269. transformers/models/siglip2/configuration_siglip2.py +1 -1
  1270. transformers/models/siglip2/image_processing_siglip2.py +15 -16
  1271. transformers/models/siglip2/image_processing_siglip2_fast.py +4 -5
  1272. transformers/models/siglip2/modeling_siglip2.py +114 -92
  1273. transformers/models/siglip2/modular_siglip2.py +23 -25
  1274. transformers/models/siglip2/processing_siglip2.py +2 -14
  1275. transformers/models/smollm3/configuration_smollm3.py +23 -26
  1276. transformers/models/smollm3/modeling_smollm3.py +32 -35
  1277. transformers/models/smollm3/modular_smollm3.py +27 -29
  1278. transformers/models/smolvlm/configuration_smolvlm.py +1 -1
  1279. transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
  1280. transformers/models/smolvlm/image_processing_smolvlm_fast.py +12 -12
  1281. transformers/models/smolvlm/modeling_smolvlm.py +56 -53
  1282. transformers/models/smolvlm/modular_smolvlm.py +15 -17
  1283. transformers/models/smolvlm/processing_smolvlm.py +15 -76
  1284. transformers/models/smolvlm/video_processing_smolvlm.py +7 -9
  1285. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
  1286. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +20 -23
  1287. transformers/models/speech_to_text/configuration_speech_to_text.py +0 -1
  1288. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
  1289. transformers/models/speech_to_text/modeling_speech_to_text.py +62 -54
  1290. transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
  1291. transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
  1292. transformers/models/speecht5/configuration_speecht5.py +0 -1
  1293. transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
  1294. transformers/models/speecht5/modeling_speecht5.py +200 -174
  1295. transformers/models/speecht5/number_normalizer.py +0 -1
  1296. transformers/models/speecht5/processing_speecht5.py +3 -37
  1297. transformers/models/speecht5/tokenization_speecht5.py +4 -5
  1298. transformers/models/splinter/configuration_splinter.py +0 -1
  1299. transformers/models/splinter/modeling_splinter.py +63 -59
  1300. transformers/models/splinter/tokenization_splinter.py +2 -4
  1301. transformers/models/squeezebert/configuration_squeezebert.py +0 -1
  1302. transformers/models/squeezebert/modeling_squeezebert.py +62 -62
  1303. transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
  1304. transformers/models/stablelm/configuration_stablelm.py +20 -23
  1305. transformers/models/stablelm/modeling_stablelm.py +40 -43
  1306. transformers/models/starcoder2/configuration_starcoder2.py +19 -22
  1307. transformers/models/starcoder2/modeling_starcoder2.py +34 -37
  1308. transformers/models/starcoder2/modular_starcoder2.py +13 -15
  1309. transformers/models/superglue/configuration_superglue.py +3 -3
  1310. transformers/models/superglue/image_processing_superglue.py +15 -15
  1311. transformers/models/superglue/image_processing_superglue_fast.py +5 -7
  1312. transformers/models/superglue/modeling_superglue.py +32 -33
  1313. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1314. transformers/models/superpoint/image_processing_superpoint_fast.py +5 -7
  1315. transformers/models/superpoint/modeling_superpoint.py +13 -14
  1316. transformers/models/swiftformer/configuration_swiftformer.py +0 -1
  1317. transformers/models/swiftformer/modeling_swiftformer.py +16 -14
  1318. transformers/models/swin/configuration_swin.py +0 -1
  1319. transformers/models/swin/modeling_swin.py +74 -82
  1320. transformers/models/swin2sr/configuration_swin2sr.py +0 -1
  1321. transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
  1322. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -6
  1323. transformers/models/swin2sr/modeling_swin2sr.py +75 -61
  1324. transformers/models/swinv2/configuration_swinv2.py +0 -1
  1325. transformers/models/swinv2/modeling_swinv2.py +96 -100
  1326. transformers/models/switch_transformers/configuration_switch_transformers.py +0 -1
  1327. transformers/models/switch_transformers/modeling_switch_transformers.py +34 -41
  1328. transformers/models/switch_transformers/modular_switch_transformers.py +31 -38
  1329. transformers/models/t5/configuration_t5.py +7 -2
  1330. transformers/models/t5/modeling_t5.py +76 -84
  1331. transformers/models/t5/tokenization_t5.py +1 -3
  1332. transformers/models/t5gemma/configuration_t5gemma.py +33 -34
  1333. transformers/models/t5gemma/modeling_t5gemma.py +97 -100
  1334. transformers/models/t5gemma/modular_t5gemma.py +117 -118
  1335. transformers/models/t5gemma2/configuration_t5gemma2.py +59 -96
  1336. transformers/models/t5gemma2/modeling_t5gemma2.py +109 -103
  1337. transformers/models/t5gemma2/modular_t5gemma2.py +375 -91
  1338. transformers/models/table_transformer/configuration_table_transformer.py +1 -2
  1339. transformers/models/table_transformer/modeling_table_transformer.py +47 -49
  1340. transformers/models/tapas/configuration_tapas.py +0 -1
  1341. transformers/models/tapas/modeling_tapas.py +64 -66
  1342. transformers/models/tapas/tokenization_tapas.py +115 -153
  1343. transformers/models/textnet/configuration_textnet.py +0 -1
  1344. transformers/models/textnet/image_processing_textnet.py +22 -25
  1345. transformers/models/textnet/image_processing_textnet_fast.py +5 -7
  1346. transformers/models/textnet/modeling_textnet.py +13 -14
  1347. transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
  1348. transformers/models/time_series_transformer/modeling_time_series_transformer.py +79 -81
  1349. transformers/models/timesfm/configuration_timesfm.py +0 -1
  1350. transformers/models/timesfm/modeling_timesfm.py +29 -19
  1351. transformers/models/timesfm/modular_timesfm.py +28 -18
  1352. transformers/models/timesformer/configuration_timesformer.py +0 -1
  1353. transformers/models/timesformer/modeling_timesformer.py +13 -16
  1354. transformers/models/timm_backbone/configuration_timm_backbone.py +0 -1
  1355. transformers/models/timm_backbone/modeling_timm_backbone.py +17 -15
  1356. transformers/models/timm_wrapper/configuration_timm_wrapper.py +5 -3
  1357. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
  1358. transformers/models/timm_wrapper/modeling_timm_wrapper.py +32 -28
  1359. transformers/models/trocr/configuration_trocr.py +0 -1
  1360. transformers/models/trocr/modeling_trocr.py +39 -42
  1361. transformers/models/trocr/processing_trocr.py +5 -25
  1362. transformers/models/tvp/configuration_tvp.py +5 -2
  1363. transformers/models/tvp/image_processing_tvp.py +50 -52
  1364. transformers/models/tvp/image_processing_tvp_fast.py +9 -10
  1365. transformers/models/tvp/modeling_tvp.py +25 -27
  1366. transformers/models/tvp/processing_tvp.py +2 -14
  1367. transformers/models/udop/configuration_udop.py +1 -1
  1368. transformers/models/udop/modeling_udop.py +63 -70
  1369. transformers/models/udop/processing_udop.py +7 -26
  1370. transformers/models/udop/tokenization_udop.py +80 -93
  1371. transformers/models/umt5/configuration_umt5.py +2 -3
  1372. transformers/models/umt5/modeling_umt5.py +80 -87
  1373. transformers/models/unispeech/configuration_unispeech.py +0 -1
  1374. transformers/models/unispeech/modeling_unispeech.py +47 -49
  1375. transformers/models/unispeech/modular_unispeech.py +20 -22
  1376. transformers/models/unispeech_sat/configuration_unispeech_sat.py +0 -1
  1377. transformers/models/unispeech_sat/modeling_unispeech_sat.py +63 -65
  1378. transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
  1379. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1380. transformers/models/univnet/modeling_univnet.py +7 -8
  1381. transformers/models/upernet/configuration_upernet.py +0 -1
  1382. transformers/models/upernet/modeling_upernet.py +10 -13
  1383. transformers/models/vaultgemma/__init__.py +0 -1
  1384. transformers/models/vaultgemma/configuration_vaultgemma.py +24 -26
  1385. transformers/models/vaultgemma/modeling_vaultgemma.py +35 -37
  1386. transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
  1387. transformers/models/video_llama_3/image_processing_video_llama_3.py +43 -42
  1388. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +8 -8
  1389. transformers/models/video_llama_3/modeling_video_llama_3.py +77 -66
  1390. transformers/models/video_llama_3/modular_video_llama_3.py +110 -112
  1391. transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
  1392. transformers/models/video_llama_3/video_processing_video_llama_3.py +18 -18
  1393. transformers/models/video_llava/configuration_video_llava.py +0 -1
  1394. transformers/models/video_llava/image_processing_video_llava.py +35 -38
  1395. transformers/models/video_llava/modeling_video_llava.py +59 -57
  1396. transformers/models/video_llava/processing_video_llava.py +38 -78
  1397. transformers/models/video_llava/video_processing_video_llava.py +0 -1
  1398. transformers/models/videomae/configuration_videomae.py +0 -1
  1399. transformers/models/videomae/image_processing_videomae.py +31 -34
  1400. transformers/models/videomae/modeling_videomae.py +13 -15
  1401. transformers/models/videomae/video_processing_videomae.py +0 -1
  1402. transformers/models/vilt/configuration_vilt.py +2 -3
  1403. transformers/models/vilt/image_processing_vilt.py +29 -30
  1404. transformers/models/vilt/image_processing_vilt_fast.py +9 -10
  1405. transformers/models/vilt/modeling_vilt.py +83 -78
  1406. transformers/models/vilt/processing_vilt.py +2 -14
  1407. transformers/models/vipllava/configuration_vipllava.py +0 -1
  1408. transformers/models/vipllava/modeling_vipllava.py +45 -42
  1409. transformers/models/vipllava/modular_vipllava.py +30 -32
  1410. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
  1411. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +18 -21
  1412. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
  1413. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +18 -21
  1414. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
  1415. transformers/models/visual_bert/configuration_visual_bert.py +0 -1
  1416. transformers/models/visual_bert/modeling_visual_bert.py +92 -92
  1417. transformers/models/vit/configuration_vit.py +0 -1
  1418. transformers/models/vit/image_processing_vit.py +19 -22
  1419. transformers/models/vit/image_processing_vit_fast.py +0 -1
  1420. transformers/models/vit/modeling_vit.py +13 -15
  1421. transformers/models/vit_mae/configuration_vit_mae.py +0 -1
  1422. transformers/models/vit_mae/modeling_vit_mae.py +21 -23
  1423. transformers/models/vit_msn/configuration_vit_msn.py +0 -1
  1424. transformers/models/vit_msn/modeling_vit_msn.py +10 -12
  1425. transformers/models/vitdet/configuration_vitdet.py +0 -1
  1426. transformers/models/vitdet/modeling_vitdet.py +12 -14
  1427. transformers/models/vitmatte/configuration_vitmatte.py +2 -5
  1428. transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
  1429. transformers/models/vitmatte/image_processing_vitmatte_fast.py +14 -16
  1430. transformers/models/vitmatte/modeling_vitmatte.py +13 -11
  1431. transformers/models/vitpose/configuration_vitpose.py +4 -7
  1432. transformers/models/vitpose/image_processing_vitpose.py +24 -25
  1433. transformers/models/vitpose/image_processing_vitpose_fast.py +9 -11
  1434. transformers/models/vitpose/modeling_vitpose.py +10 -12
  1435. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +0 -1
  1436. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +8 -10
  1437. transformers/models/vits/configuration_vits.py +0 -1
  1438. transformers/models/vits/modeling_vits.py +34 -35
  1439. transformers/models/vits/tokenization_vits.py +3 -4
  1440. transformers/models/vivit/configuration_vivit.py +0 -1
  1441. transformers/models/vivit/image_processing_vivit.py +36 -39
  1442. transformers/models/vivit/modeling_vivit.py +5 -7
  1443. transformers/models/vjepa2/__init__.py +0 -1
  1444. transformers/models/vjepa2/configuration_vjepa2.py +0 -1
  1445. transformers/models/vjepa2/modeling_vjepa2.py +30 -32
  1446. transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
  1447. transformers/models/voxtral/__init__.py +0 -1
  1448. transformers/models/voxtral/configuration_voxtral.py +0 -1
  1449. transformers/models/voxtral/modeling_voxtral.py +19 -27
  1450. transformers/models/voxtral/modular_voxtral.py +12 -21
  1451. transformers/models/voxtral/processing_voxtral.py +25 -48
  1452. transformers/models/wav2vec2/configuration_wav2vec2.py +0 -1
  1453. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
  1454. transformers/models/wav2vec2/modeling_wav2vec2.py +67 -122
  1455. transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
  1456. transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
  1457. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +0 -1
  1458. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +65 -62
  1459. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +52 -48
  1460. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
  1461. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +0 -1
  1462. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +84 -77
  1463. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +37 -30
  1464. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
  1465. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
  1466. transformers/models/wavlm/configuration_wavlm.py +0 -1
  1467. transformers/models/wavlm/modeling_wavlm.py +45 -48
  1468. transformers/models/wavlm/modular_wavlm.py +4 -5
  1469. transformers/models/whisper/configuration_whisper.py +0 -1
  1470. transformers/models/whisper/english_normalizer.py +3 -4
  1471. transformers/models/whisper/feature_extraction_whisper.py +9 -24
  1472. transformers/models/whisper/generation_whisper.py +27 -48
  1473. transformers/models/whisper/modeling_whisper.py +73 -73
  1474. transformers/models/whisper/processing_whisper.py +3 -20
  1475. transformers/models/whisper/tokenization_whisper.py +9 -30
  1476. transformers/models/x_clip/configuration_x_clip.py +0 -1
  1477. transformers/models/x_clip/modeling_x_clip.py +70 -69
  1478. transformers/models/x_clip/processing_x_clip.py +2 -14
  1479. transformers/models/xcodec/configuration_xcodec.py +4 -6
  1480. transformers/models/xcodec/modeling_xcodec.py +20 -17
  1481. transformers/models/xglm/configuration_xglm.py +0 -1
  1482. transformers/models/xglm/modeling_xglm.py +59 -55
  1483. transformers/models/xglm/tokenization_xglm.py +1 -4
  1484. transformers/models/xlm/configuration_xlm.py +0 -1
  1485. transformers/models/xlm/modeling_xlm.py +139 -144
  1486. transformers/models/xlm/tokenization_xlm.py +3 -5
  1487. transformers/models/xlm_roberta/configuration_xlm_roberta.py +0 -1
  1488. transformers/models/xlm_roberta/modeling_xlm_roberta.py +195 -194
  1489. transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
  1490. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
  1491. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +0 -1
  1492. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +94 -93
  1493. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
  1494. transformers/models/xlnet/configuration_xlnet.py +0 -11
  1495. transformers/models/xlnet/modeling_xlnet.py +152 -163
  1496. transformers/models/xlnet/tokenization_xlnet.py +1 -4
  1497. transformers/models/xlstm/configuration_xlstm.py +3 -5
  1498. transformers/models/xlstm/modeling_xlstm.py +62 -65
  1499. transformers/models/xmod/configuration_xmod.py +0 -1
  1500. transformers/models/xmod/modeling_xmod.py +101 -100
  1501. transformers/models/yolos/configuration_yolos.py +0 -1
  1502. transformers/models/yolos/image_processing_yolos.py +60 -62
  1503. transformers/models/yolos/image_processing_yolos_fast.py +18 -18
  1504. transformers/models/yolos/modeling_yolos.py +12 -14
  1505. transformers/models/yolos/modular_yolos.py +2 -4
  1506. transformers/models/yoso/configuration_yoso.py +0 -1
  1507. transformers/models/yoso/modeling_yoso.py +64 -63
  1508. transformers/models/zamba/configuration_zamba.py +0 -1
  1509. transformers/models/zamba/modeling_zamba.py +70 -70
  1510. transformers/models/zamba2/configuration_zamba2.py +36 -37
  1511. transformers/models/zamba2/modeling_zamba2.py +87 -89
  1512. transformers/models/zamba2/modular_zamba2.py +43 -45
  1513. transformers/models/zoedepth/configuration_zoedepth.py +1 -2
  1514. transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
  1515. transformers/models/zoedepth/image_processing_zoedepth_fast.py +12 -15
  1516. transformers/models/zoedepth/modeling_zoedepth.py +21 -16
  1517. transformers/pipelines/__init__.py +59 -55
  1518. transformers/pipelines/any_to_any.py +14 -22
  1519. transformers/pipelines/audio_utils.py +1 -2
  1520. transformers/pipelines/automatic_speech_recognition.py +20 -12
  1521. transformers/pipelines/base.py +13 -17
  1522. transformers/pipelines/deprecated/__init__.py +0 -1
  1523. transformers/pipelines/document_question_answering.py +1 -1
  1524. transformers/pipelines/image_text_to_text.py +0 -1
  1525. transformers/pipelines/image_to_text.py +4 -44
  1526. transformers/pipelines/question_answering.py +5 -44
  1527. transformers/pipelines/text_classification.py +1 -14
  1528. transformers/pipelines/text_to_audio.py +2 -2
  1529. transformers/pipelines/token_classification.py +1 -22
  1530. transformers/pipelines/video_classification.py +1 -9
  1531. transformers/pipelines/zero_shot_audio_classification.py +0 -1
  1532. transformers/pipelines/zero_shot_classification.py +0 -6
  1533. transformers/pipelines/zero_shot_image_classification.py +0 -7
  1534. transformers/processing_utils.py +222 -151
  1535. transformers/quantizers/auto.py +2 -4
  1536. transformers/quantizers/base.py +19 -64
  1537. transformers/quantizers/quantizer_aqlm.py +1 -18
  1538. transformers/quantizers/quantizer_auto_round.py +1 -10
  1539. transformers/quantizers/quantizer_awq.py +3 -8
  1540. transformers/quantizers/quantizer_bitnet.py +1 -6
  1541. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  1542. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  1543. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  1544. transformers/quantizers/quantizer_eetq.py +2 -12
  1545. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  1546. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  1547. transformers/quantizers/quantizer_fp_quant.py +4 -4
  1548. transformers/quantizers/quantizer_gptq.py +1 -4
  1549. transformers/quantizers/quantizer_higgs.py +2 -6
  1550. transformers/quantizers/quantizer_mxfp4.py +2 -28
  1551. transformers/quantizers/quantizer_quanto.py +14 -14
  1552. transformers/quantizers/quantizer_quark.py +0 -1
  1553. transformers/quantizers/quantizer_spqr.py +3 -8
  1554. transformers/quantizers/quantizer_torchao.py +31 -127
  1555. transformers/quantizers/quantizer_vptq.py +1 -10
  1556. transformers/testing_utils.py +31 -49
  1557. transformers/tokenization_mistral_common.py +554 -902
  1558. transformers/tokenization_utils_base.py +112 -124
  1559. transformers/tokenization_utils_sentencepiece.py +5 -6
  1560. transformers/tokenization_utils_tokenizers.py +30 -7
  1561. transformers/trainer.py +30 -11
  1562. transformers/trainer_callback.py +8 -0
  1563. transformers/trainer_jit_checkpoint.py +1 -2
  1564. transformers/trainer_seq2seq.py +4 -0
  1565. transformers/training_args.py +11 -13
  1566. transformers/utils/__init__.py +4 -0
  1567. transformers/utils/attention_visualizer.py +5 -5
  1568. transformers/utils/auto_docstring.py +598 -37
  1569. transformers/utils/doc.py +1 -1
  1570. transformers/utils/dummy_pt_objects.py +0 -42
  1571. transformers/utils/generic.py +21 -1
  1572. transformers/utils/import_utils.py +51 -9
  1573. transformers/utils/kernel_config.py +71 -18
  1574. transformers/utils/loading_report.py +3 -3
  1575. transformers/utils/quantization_config.py +16 -18
  1576. transformers/video_processing_utils.py +35 -32
  1577. transformers/video_utils.py +18 -22
  1578. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/METADATA +23 -24
  1579. transformers-5.0.0rc3.dist-info/RECORD +2067 -0
  1580. transformers-5.0.0rc1.dist-info/RECORD +0 -2003
  1581. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/WHEEL +0 -0
  1582. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/entry_points.txt +0 -0
  1583. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/licenses/LICENSE +0 -0
  1584. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1480 @@
+ # Copyright 2025 the HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from collections.abc import Callable
+ from typing import Any
+
+ import numpy as np
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from ...cache_utils import Cache
+ from ...configuration_utils import PreTrainedConfig
+ from ...feature_extraction_utils import BatchFeature
+ from ...generation import GenerationMixin
+ from ...image_utils import ImageInput
+ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+ from ...processing_utils import ImagesKwargs, ProcessorMixin, Unpack
+ from ...tokenization_utils_base import PreTokenizedInput, TextInput
+ from ...utils import TransformersKwargs, is_torch_available, logging
+ from ..chameleon.modeling_chameleon import ChameleonVQVAE, ChameleonVQVAEVectorQuantizer
+ from ..glm4v.configuration_glm4v import Glm4vTextConfig, Glm4vVisionConfig
+ from ..glm4v.modeling_glm4v import (
+     Glm4vCausalLMOutputWithPast,
+     Glm4vModel,
+     Glm4vModelOutputWithPast,
+     Glm4vPreTrainedModel,
+     Glm4vTextModel,
+     Glm4vVisionAttention,
+     Glm4vVisionBlock,
+     Glm4vVisionEmbeddings,
+     Glm4vVisionModel,
+     Glm4vVisionPatchEmbed,
+ )
+ from ..glm4v_moe.modeling_glm4v_moe import Glm4vMoeTextAttention, eager_attention_forward
+ from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
+ from ..qwen2_vl.image_processing_qwen2_vl_fast import Qwen2VLImageProcessorFast
+ from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessorKwargs
+ from ..siglip.modeling_siglip import SiglipMLP
+
+
+ if is_torch_available():
+     import torch
+
+ logger = logging.get_logger(__name__)
+
+
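# --- Aside: a minimal sketch (not part of the diff) of the guarded-import
# pattern used above. `is_torch_available()` keeps the module importable in
# torch-free environments; torch-only symbols are bound only when torch exists.
from transformers.utils import is_torch_available

if is_torch_available():
    import torch

    def bool_mask(n: int) -> "torch.Tensor":
        # Defined only when torch is installed; callers can feature-detect it.
        return torch.ones(n, dtype=torch.bool)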
+ class GlmImageVQVAEConfig(PreTrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`GlmImageVQModel`]. It is used to instantiate a
+     `GlmImageVQModel` according to the specified arguments, defining the model architecture.
+     Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PreTrainedConfig`] for more information. Instantiating a
+     configuration with the defaults will yield a similar configuration to the VQModel of the
+     [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image) architecture.
+
+     Args:
+         embed_dim (`int`, *optional*, defaults to 2048):
+             Dimensionality of each embedding vector.
+         num_embeddings (`int`, *optional*, defaults to 16384):
+             Number of codebook embeddings.
+         latent_channels (`int`, *optional*, defaults to 1536):
+             Number of channels for the latent space.
+         in_channels (`int`, *optional*, defaults to 3):
+             Number of input channels.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+     """
+
+     model_type = "glm_image_vqmodel"
+     base_config_key = "vq_config"
+
+     def __init__(
+         self,
+         embed_dim: int = 2048,
+         num_embeddings: int = 16384,
+         latent_channels: int = 1536,
+         in_channels: int = 3,
+         initializer_range=0.02,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.num_embeddings = num_embeddings
+         self.latent_channels = latent_channels
+         self.in_channels = in_channels
+         self.initializer_range = initializer_range
+
+
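# --- Aside: an illustrative usage sketch, not part of the diff. It assumes
# GlmImageVQVAEConfig is importable from the top-level `transformers`
# namespace in 5.0.0rc3; the exact import path is an assumption.
from transformers import GlmImageVQVAEConfig  # hypothetical import path

# Defaults mirror the docstring above: a 16384-entry codebook of
# 2048-dimensional vectors over a 1536-channel latent space.
config = GlmImageVQVAEConfig()
assert config.num_embeddings == 16384 and config.embed_dim == 2048

# Any documented argument can be overridden at construction time; the rest
# keep their defaults.
small = GlmImageVQVAEConfig(num_embeddings=8192, latent_channels=768)
print(small.num_embeddings, small.latent_channels)  # 8192 768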
+ class GlmImageVisionConfig(Glm4vVisionConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`GlmImageVisionModel`]. It is used to instantiate a GlmImageVisionModel
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
+     a similar configuration to that of
+     GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image).
+
+     Args:
+         depth (`int`, *optional*, defaults to 40):
+             Number of layers (depth) in the model.
+         hidden_size (`int`, *optional*, defaults to 1536):
+             Dimensionality of the encoder layers and the pooler layer.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         attention_bias (`bool`, *optional*, defaults to `True`):
+             Whether to add a bias to the queries, keys and values.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             Dropout probability for attention weights.
+         num_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer architecture.
+         in_channels (`int`, *optional*, defaults to 3):
+             Number of input channels.
+         image_size (`int` or `list[int]`, *optional*, defaults to 2048):
+             The size (resolution) of each image.
+         patch_size (`int`, *optional*, defaults to 16):
+             The size (resolution) of each patch.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the layer normalization layers.
+         spatial_merge_size (`int`, *optional*, defaults to 1):
+             The size used for merging spatial dimensions.
+         intermediate_size (`int`, *optional*, defaults to 6144):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+     """
+
+     model_type = "glm_image_vision"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         depth=40,
+         hidden_size=1536,
+         hidden_act="gelu",
+         attention_bias=True,
+         attention_dropout=0.0,
+         num_heads=16,
+         in_channels=3,
+         image_size=2048,
+         patch_size=16,
+         layer_norm_eps=1e-06,
+         spatial_merge_size=1,
+         intermediate_size=6144,
+         initializer_range=0.02,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         del self.out_hidden_size
+         del self.rms_norm_eps
+         del self.temporal_patch_size
+         self.layer_norm_eps = layer_norm_eps
+
+
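# --- Aside: a stand-alone sketch (stand-in classes, not the real configs) of
# the pattern in GlmImageVisionConfig.__init__ above: inherit the parent
# config's defaults via super().__init__(), then `del` inherited attributes
# that do not apply, so they never appear on the subclass or in serialization.
class ParentVisionConfig:
    def __init__(self, **kwargs):
        self.temporal_patch_size = 2  # video-oriented field
        self.rms_norm_eps = 1e-5      # parent uses RMSNorm

class ImageOnlyVisionConfig(ParentVisionConfig):
    def __init__(self, layer_norm_eps=1e-6, **kwargs):
        super().__init__(**kwargs)
        del self.temporal_patch_size  # image-only model: drop the video field
        del self.rms_norm_eps         # this encoder uses LayerNorm instead
        self.layer_norm_eps = layer_norm_eps

cfg = ImageOnlyVisionConfig()
assert not hasattr(cfg, "temporal_patch_size")
assert cfg.layer_norm_eps == 1e-6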
165
+ class GlmImageTextConfig(Glm4vTextConfig):
166
+ r"""
167
+ This is the configuration class to store the configuration of a [`GlmImageTextModel`]. It is used to instantiate a
168
+ GLM-Image model according to the specified arguments, defining the model architecture. Instantiating a
169
+ configuration with the defaults will yield a similar configuration to that of
170
+ GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image).
171
+
172
+ Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
173
+ documentation from [`PreTrainedConfig`] for more information.
174
+
175
+ Args:
176
+ vocab_size (`int`, *optional*, defaults to 168064):
177
+ Vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented by the
178
+ `input_ids` passed when calling [`GlmImageModel`]
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 13696):
+ Dimension of the MLP representations.
+ num_hidden_layers (`int`, *optional*, defaults to 40):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_key_value_heads (`int`, *optional*, defaults to 2):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details check out [this
+ paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
+ The maximum sequence length that this model might ever be used with.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the rms normalization layers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether the model's input and output word embeddings should be tied.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ rope_parameters (`RopeParameters`, *optional*):
+ Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
+ a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
+ with longer `max_position_embeddings`.
+ vision_vocab_size (`int`, *optional*, defaults to 16512):
+ Vision vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented
+ by the `input_ids` passed when calling [`GlmImageVisionModel`]
+ attention_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+
+ ```python
+ >>> from transformers import GlmImageTextModel, GlmImageTextConfig
+
+ >>> # Initializing a GlmImageTextConfig style configuration
+ >>> configuration = GlmImageTextConfig()
+
+ >>> # Initializing a model from the GlmImageTextConfig style configuration
+ >>> model = GlmImageTextModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ def __init__(
+ self,
+ vocab_size: int | None = 168064,
+ vision_vocab_size: int | None = 16512,
+ attention_bias: bool | None = True,
+ tie_word_embeddings: bool | None = False,
+ **super_kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.vision_vocab_size = vision_vocab_size
+ self.attention_bias = attention_bias
+ super().__init__(
+ tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **super_kwargs
+ )
+
+
+ class GlmImageConfig(PreTrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`GlmImageModel`]. It is used to instantiate a
+ GLM-Image model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of
+ GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image) architecture.
+
+ Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PreTrainedConfig`] for more information.
+
+ Args:
+ text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmImageTextConfig`):
+ The config object or dictionary of the text backbone.
+ vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmImageVisionConfig`):
+ The config object or dictionary of the vision backbone.
+ vq_config (`Union[Dict, GlmImageVQVAEConfig]`, *optional*):
+ GlmImageVQVAEConfig instance containing the configuration for the VQ-VAE model.
+ image_token_id (`int`, *optional*, defaults to 167855):
+ The image token index to encode the image prompt.
+ image_start_token_id (`int`, *optional*, defaults to 16384):
+ The image start token index to encode the start of image.
+ image_end_token_id (`int`, *optional*, defaults to 16385):
+ The image end token index to encode the end of image.
+
+ ```python
+ >>> from transformers import GlmImageForConditionalGeneration, GlmImageConfig
+
+ >>> # Initializing a GLM-Image style configuration
+ >>> configuration = GlmImageConfig()
+
+ >>> # Initializing a model from the GLM-Image style configuration
+ >>> model = GlmImageForConditionalGeneration(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "glm_image"
+ sub_configs = {
+ "vision_config": GlmImageVisionConfig,
+ "text_config": GlmImageTextConfig,
+ "vq_config": GlmImageVQVAEConfig,
+ }
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ text_config=None,
+ vision_config=None,
+ vq_config=None,
+ image_token_id=167855,
+ image_start_token_id=16384,
+ image_end_token_id=16385,
+ **kwargs,
+ ):
+ if isinstance(vision_config, dict):
+ vision_config = self.sub_configs["vision_config"](**vision_config)
+ elif vision_config is None:
+ vision_config = self.sub_configs["vision_config"](**kwargs)
+
+ if isinstance(vq_config, dict):
+ vq_config = self.sub_configs["vq_config"](**vq_config)
+ elif vq_config is None:
+ vq_config = self.sub_configs["vq_config"](**kwargs)
+
+ if isinstance(text_config, dict):
+ text_config = self.sub_configs["text_config"](**text_config)
+ elif text_config is None:
+ text_config = self.sub_configs["text_config"](**kwargs)
+
+ self.image_token_id = image_token_id
+ self.image_start_token_id = image_start_token_id
+ self.image_end_token_id = image_end_token_id
+ self.text_config = text_config
+ self.vision_config = vision_config
+ self.vq_config = vq_config
+ super().__init__(**kwargs)
+
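A minimal sketch of how the three sub-configs compose, assuming the `GlmImage*` config classes exported at the bottom of this file are importable from this build:

```python
# Illustrative only: tiny, non-release values to show dict -> sub-config promotion.
from transformers import GlmImageConfig

config = GlmImageConfig(
    text_config={"num_hidden_layers": 2, "hidden_size": 64},
    vision_config={"depth": 2, "hidden_size": 32, "num_heads": 2},
    vq_config={"embed_dim": 16, "num_embeddings": 128},
)
print(type(config.text_config).__name__)  # GlmImageTextConfig
print(config.vq_config.num_embeddings)  # 128
```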
+
+ class GlmImageVisionMLP(SiglipMLP):
+ pass
+
+
+ class GlmImageVisionAttention(Glm4vVisionAttention):
+ def __init__(self, config: GlmImageVisionConfig) -> None:
+ super().__init__(config)
+ self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
+ self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ cu_seqlens: torch.Tensor,
+ **kwargs,
+ ) -> torch.Tensor:
+ seq_length = hidden_states.shape[0]
+ query_states, key_states, value_states = (
+ self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+ )
+ query_states = query_states.transpose(0, 1).unsqueeze(0)
+ key_states = key_states.transpose(0, 1).unsqueeze(0)
+ value_states = value_states.transpose(0, 1).unsqueeze(0)
+
+ attention_interface: Callable = eager_attention_forward
+ if self.config._attn_implementation != "eager":
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+ if "flash" in self.config._attn_implementation:
+ # Flash Attention: Use cu_seqlens for variable length attention
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+ attn_output, _ = attention_interface(
+ self,
+ query_states,
+ key_states,
+ value_states,
+ attention_mask=None,
+ scaling=self.scaling,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ cu_seq_lens_q=cu_seqlens,
+ cu_seq_lens_k=cu_seqlens,
+ max_length_q=max_seqlen,
+ max_length_k=max_seqlen,
+ is_causal=False,
+ **kwargs,
+ )
+ else:
+ # Other implementations: Process each chunk separately
+ lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+ splits = [
+ torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
+ ]
+
+ attn_outputs = [
+ attention_interface(
+ self,
+ q,
+ k,
+ v,
+ attention_mask=None,
+ scaling=self.scaling,
+ dropout=0.0 if not self.training else self.attention_dropout,
+ is_causal=False,
+ **kwargs,
+ )[0]
+ for q, k, v in zip(*splits)
+ ]
+ attn_output = torch.cat(attn_outputs, dim=1)
+
+ attn_output = attn_output.reshape(seq_length, -1).contiguous()
+ attn_output = self.proj(attn_output)
+ return attn_output
+
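For the non-flash branch, `cu_seqlens` carries the boundaries that split the packed patch sequence back into per-image chunks; a standalone sketch with made-up sizes:

```python
import torch

# Two images of 6 and 16 patches packed along the sequence axis;
# q/k/v have shape (1, num_heads, total_patches, head_dim).
cu_seqlens = torch.tensor([0, 6, 22])
q = torch.randn(1, 2, 22, 8)

lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()  # [6, 16]
chunks = torch.split(q, lengths, dim=2)
print([tuple(c.shape) for c in chunks])  # [(1, 2, 6, 8), (1, 2, 16, 8)]
# Each chunk is attended to on its own, so patches never attend across images,
# and the per-chunk outputs are concatenated back along the sequence axis.
```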
+
+ class GlmImageVisionPatchEmbed(Glm4vVisionPatchEmbed):
+ def __init__(self, config: GlmImageVisionConfig) -> None:
+ super().__init__(config)
+
+ del self.temporal_patch_size
+ kernel_size = [self.patch_size, self.patch_size]
+ self.proj = nn.Conv2d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size)
+
+ def forward(self, hidden_states):
+ target_dtype = self.proj.weight.dtype
+ hidden_states = hidden_states.view(-1, self.in_channels, self.patch_size, self.patch_size)
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
+ return hidden_states
+
+
+ class GlmImageVisionEmbeddings(Glm4vVisionEmbeddings):
+ def __init__(self, config: GlmImageVisionConfig) -> None:
+ super().__init__(config)
+ self.interpolated_method = "bilinear"
+
+
+ class GlmImageVisionBlock(Glm4vVisionBlock):
+ def __init__(self, config: GlmImageVisionConfig):
+ super().__init__(config)
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.attn = GlmImageVisionAttention(config)
+ self.mlp = GlmImageVisionMLP(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ cu_seqlens: torch.Tensor,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> torch.Tensor:
+ r"""
+ cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`):
+ The cumulative sequence lengths of each image or video feature.
+ """
442
+ residual = hidden_states
443
+
444
+ hidden_states = self.norm1(hidden_states)
445
+ hidden_states = self.attn(
446
+ hidden_states,
447
+ cu_seqlens=cu_seqlens,
448
+ **kwargs,
449
+ )
450
+ hidden_states = residual + hidden_states
451
+
452
+ residual = hidden_states
453
+ hidden_states = self.norm2(hidden_states)
454
+ hidden_states = self.mlp(hidden_states)
455
+ hidden_states = residual + hidden_states
456
+
457
+ return hidden_states
458
+
459
+
460
+ class GlmImageTextAttention(Glm4vMoeTextAttention):
+ pass
+
+
+ class GlmImagePreTrainedModel(Glm4vPreTrainedModel):
+ config: GlmImageConfig
+ input_modalities = ("image", "text")
+
+ @torch.no_grad()
+ def _init_weights(self, module):
+ PreTrainedModel._init_weights(self, module)
+
+
+ class GlmImageModelOutputWithPast(Glm4vModelOutputWithPast):
+ pass
+
+
+ class GlmImageVQVAEVectorQuantizer(ChameleonVQVAEVectorQuantizer):
+ def __init__(self, config: GlmImageVQVAEConfig):
+ super().__init__(config)
+ self.num_embeddings = config.num_embeddings
+ self.embedding_dim = config.embed_dim
+ self.beta = getattr(config, "beta", 0.25)
+
+ self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
+
+ def forward(self, hidden_state: torch.Tensor):
+ hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
+ hidden_state_flattened = hidden_state.view(-1, self.embedding_dim)
+
+ # L2 normalize
+ hidden_state = F.normalize(hidden_state, p=2, dim=-1)
+ hidden_state_flattened = F.normalize(hidden_state_flattened, p=2, dim=-1)
+ embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
+
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+ distances = (
+ torch.sum(hidden_state_flattened**2, dim=1, keepdim=True)
+ + torch.sum(embedding**2, dim=1)
+ - 2 * torch.einsum("bd,dn->bn", hidden_state_flattened, embedding.transpose(0, 1))
+ )
+
+ min_encoding_indices = torch.argmin(distances, dim=1)
+ hidden_state_quant = embedding[min_encoding_indices].view(hidden_state.shape)
+
+ # compute loss for embedding
+ loss = torch.mean((hidden_state_quant.detach() - hidden_state) ** 2) + self.beta * torch.mean(
+ (hidden_state_quant - hidden_state.detach()) ** 2
+ )
+
+ # preserve gradients
+ hidden_state_quant = hidden_state + (hidden_state_quant - hidden_state).detach()
+
+ # reshape back to match original input shape
+ hidden_state_quant = hidden_state_quant.permute(0, 3, 1, 2).contiguous()
+
+ return hidden_state_quant, loss, min_encoding_indices
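Since both features and codebook are L2-normalized, the argmin over squared distances is a nearest-neighbour lookup on the unit sphere, equivalently an argmax over cosine similarity; a self-contained sketch with illustrative shapes:

```python
import torch
import torch.nn.functional as F

codebook = F.normalize(torch.randn(16, 4), p=2, dim=-1)  # (num_embeddings, embed_dim)
features = F.normalize(torch.randn(9, 4), p=2, dim=-1)  # (flattened patches, embed_dim)

# ||z - e||^2 = ||z||^2 + ||e||^2 - 2 z.e; on the unit sphere this is 2 - 2 z.e
distances = (
    (features**2).sum(dim=1, keepdim=True)
    + (codebook**2).sum(dim=1)
    - 2 * features @ codebook.T
)
indices = distances.argmin(dim=1)
print(torch.equal(indices, (features @ codebook.T).argmax(dim=1)))  # True (barring float ties)
```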
+
+
+ class GlmImageVQVAE(ChameleonVQVAE):
+ _no_split_modules = [
+ "GlmImageVQVAEVectorQuantizer",
+ ]
+
+ def __init__(self, config: GlmImageVQVAEConfig):
+ super().__init__(config)
+ del self.encoder
+
+ def encode(self, hidden_states):
+ hidden_states = self.quant_conv(hidden_states)
+ quant, emb_loss, indices = self.quantize(hidden_states)
+ return quant, emb_loss, indices
+
+
+ class GlmImageVisionModel(Glm4vVisionModel):
+ config: GlmImageVisionConfig
+ main_input_name = "pixel_values"
+ input_modalities = ("image",)
+
+ def __init__(self, config: GlmImageVisionConfig):
+ super().__init__(config)
+
+ head_dim = config.hidden_size // config.num_heads
+ self.head_dim = head_dim
+
+ del self.merger
+ del self.rotary_pos_emb
+ del self.post_conv_layernorm
+ del self.downsample
+ del self.post_layernorm
+
+ def rot_pos_emb(self, grid_thw):
+ pos_ids = []
+ for t, h, w in grid_thw:
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
+ hpos_ids = hpos_ids.reshape(
+ h // self.spatial_merge_size,
+ self.spatial_merge_size,
+ w // self.spatial_merge_size,
+ self.spatial_merge_size,
+ )
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
+ hpos_ids = hpos_ids.flatten()
+
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
+ wpos_ids = wpos_ids.reshape(
+ h // self.spatial_merge_size,
+ self.spatial_merge_size,
+ w // self.spatial_merge_size,
+ self.spatial_merge_size,
+ )
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
+ wpos_ids = wpos_ids.flatten()
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
+ pos_ids = torch.cat(pos_ids, dim=0)
+ return pos_ids
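With `spatial_merge_size=1` (the config default), `rot_pos_emb` reduces to plain per-patch row/column indices; a quick sketch for a single 2x3 grid:

```python
import torch

h, w = 2, 3
hpos = torch.arange(h).unsqueeze(1).expand(-1, w).flatten()  # tensor([0, 0, 0, 1, 1, 1])
wpos = torch.arange(w).unsqueeze(0).expand(h, -1).flatten()  # tensor([0, 1, 2, 0, 1, 2])
pos_ids = torch.stack([hpos, wpos], dim=-1)  # (6, 2): one (row, col) pair per patch
print(pos_ids.tolist())
```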
+
+ def forward(self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
+ """
+ Args:
+ pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`):
+ Packed pixel values.
+ grid_thw (`torch.Tensor` of shape `(num_images, 3)`):
+ The temporal, height and width of feature shape of each image.
+
+ Returns:
+ `torch.Tensor` of shape `(total_patches, hidden_size)`: Hidden states.
+ """
+
+ hidden_states = self.patch_embed(pixel_values)
+ image_type_ids = self.rot_pos_emb(grid_thw)
+
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+ dim=0,
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+ )
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+ seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+ hidden_states = self.embeddings(
+ hidden_states,
+ seqlens,
+ grid_thw,
+ image_type_ids[:, 0].to(hidden_states.device),
+ image_type_ids[:, 1].to(hidden_states.device),
+ )
+
+ # Transformer blocks (no position_embeddings needed, already added above)
+ for blk in self.blocks:
+ hidden_states = blk(
+ hidden_states,
+ cu_seqlens=cu_seqlens,
+ )
+ return hidden_states
+
+
615
+ class GlmImageTextModel(Glm4vTextModel):
616
+ pass
617
+
618
+
619
+ class GlmImageModel(Glm4vModel):
620
+ def __init__(self, config):
621
+ super().__init__(config)
622
+ self.visual = GlmImageVisionModel._from_config(config.vision_config)
623
+ self.language_model = GlmImageTextModel._from_config(config.text_config)
624
+ self.vqmodel = GlmImageVQVAE._from_config(config.vq_config)
625
+
626
+ self.rope_deltas = None # cache rope_deltas here
627
+
628
+ # Initialize weights and apply final processing
629
+ self.post_init()
630
+
631
+ def get_rope_index(
632
+ self,
633
+ input_ids: torch.LongTensor | None = None,
634
+ image_grid_thw: torch.LongTensor | None = None,
635
+ attention_mask: torch.LongTensor | None = None,
636
+ ) -> tuple[torch.Tensor, torch.Tensor]:
637
+ """
638
+ Calculate the 3D rope index for image generation task.
639
+
640
+ Explanation:
641
+ Each embedding sequence may contain image tokens (for generation) and text tokens,
642
+ or just text tokens.
643
+
644
+ Input format:
645
+ - Text-to-Image: [text tokens] + <|dit_token_16384|>
646
+ - Image-to-Image: <|dit_token_16384|> [image tokens] <|dit_token_16385|> + [text tokens] + <|dit_token_16384|>
647
+
648
+ For pure text embedding sequence, the rotary position embedding is the same across all 3 dimensions.
649
+ Examples:
650
+ input_ids: [T T T T T], here T is for text.
651
+ temporal position_ids: [0, 1, 2, 3, 4]
652
+ height position_ids: [0, 1, 2, 3, 4]
653
+ width position_ids: [0, 1, 2, 3, 4]
654
+
655
+ For sequences with image tokens, we use special markers to denote image regions:
656
+ - <|dit_token_16384|>: image start marker
657
+ - <|dit_token_16385|>: image end marker
658
+ - Image tokens between these markers use 2D spatial position encoding.
659
+
660
+ For image tokens:
661
+ - temporal: stays constant at (image_start_pos + 1)
662
+ - height: increments every w tokens, representing row position
663
+ - width: cycles from 0 to w-1, representing column position
664
+
665
+ After each image region, the next position jumps to: image_start_pos + 1 + max(h, w)
666
+ This ensures sufficient positional separation between images and subsequent tokens.
667
+
668
+ Examples:
669
+ === Case 1: Image-to-Image Generation ===
670
+
671
+ Source image with grid [1, 3, 2], followed by text, then generation.
672
+ input_ids: [<|dit_token_16384|> V V V V V V <|dit_token_16385|> T T T T <|dit_token_16384|>]
673
+ image_grid_thw: [[1, 3, 2], [1, 4, 4]] # first is source, second is target
674
+
675
+ For source image (h=3, w=2, 6 tokens):
676
+ Start marker at position 0
677
+ Image tokens at temporal=1, height=[1,1,2,2,3,3], width=[1,2,1,2,1,2]
678
+ End marker at position 4 (= 0 + 1 + max(3,2))
679
+
680
+ Text tokens and trailing start marker continue from position 5.
681
+
682
+ Full prefill position_ids:
683
+ temporal: [0, 1,1,1,1,1,1, 4, 5,6,7,8, 9]
684
+ height: [0, 1,1,2,2,3,3, 4, 5,6,7,8, 9]
685
+ width: [0, 1,2,1,2,1,2, 4, 5,6,7,8, 9]
686
+
687
+ Decode stage: use image_grid_thw[-1] = [1, 4, 4] to build cached position_ids,
688
+ starting from gen_st_idx = 10.
689
+
690
+ === Case 2: Text-to-Image Generation (multi-resolution) ===
691
+
692
+ Pure text input with two image_grids for progressive generation.
693
+ input_ids: [hello<sop>3 3<eop><sop>3 2<eop><|dit_token_16384|>]
694
+ Assume "hello<sop>3 3<eop><sop>3 2<eop>" = 4 tokens (positions 0-3)
695
+ <|dit_token_16384|> at position 4
696
+ image_grid_thw: [[1, 3, 3], [1, 3, 2]]
697
+ - image_grid_thw[-1] = [1, 3, 2]: first generated image (smaller/draft)
698
+ - image_grid_thw[-2] = [1, 3, 3]: second generated image (larger/final)
699
+
700
+ Prefill position_ids (5 tokens: 4 text + 1 start marker):
701
+ temporal: [0, 1, 2, 3, 4]
702
+ height: [0, 1, 2, 3, 4]
703
+ width: [0, 1, 2, 3, 4]
704
+
705
+ Decode stage builds position_ids in reverse order of image_grid_thw:
706
+
707
+ First: image_grid_thw[-1] = [1, 3, 2] (6 tokens), starting at position 5:
708
+ temporal: [5, 5, 5, 5, 5, 5]
709
+ height: [5, 5, 6, 6, 7, 7]
710
+ width: [5, 6, 5, 6, 5, 6]
711
+ next_pos = 5 + max(3, 2) = 8
712
+
713
+ Then: image_grid_thw[-2] = [1, 3, 3] (9 tokens), starting at position 8:
714
+ temporal: [8, 8, 8, 8, 8, 8, 8, 8, 8]
715
+ height: [8, 8, 8, 9, 9, 9, 10, 10, 10]
716
+ width: [8, 9, 10, 8, 9, 10, 8, 9, 10]
717
+ next_pos = 8 + max(3, 3) = 11
718
+
719
+ Finally: <|dit_token_16385|> end marker at position 11
720
+
721
+ Full sequence position_ids (prefill + decode):
722
+ temporal: [0,1,2,3, 4, 5,5,5,5,5,5, 8,8,8,8,8,8,8,8,8, 11]
723
+ height: [0,1,2,3, 4, 5,5,6,6,7,7, 8,8,8,9,9,9,10,10,10, 11]
724
+ width: [0,1,2,3, 4, 5,6,5,6,5,6, 8,9,10,8,9,10,8,9,10, 11]
725
+
726
+ _cached_decode_position_ids shape: [3, 6 + 9 + 1] = [3, 16]
727
+ (includes all generated image tokens + end marker)
728
+
729
+ Args:
730
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
731
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default
732
+ should you provide it.
733
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
734
+ The temporal, height and width of feature shape of each image. For image generation,
735
+ temporal is typically 1.
736
+ - For image-to-image: includes source image grids + target image grid(s)
737
+ - For text-to-image with multi-resolution: includes multiple target grids,
738
+ processed in reverse order (last grid first, second-to-last grid second, etc.)
739
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
740
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
741
+ - 1 for tokens that are **not masked**,
742
+ - 0 for tokens that are **masked**.
743
+
744
+ Returns:
745
+ position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`):
746
+ Position IDs for temporal, height, and width dimensions.
747
+ mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`):
748
+ Position deltas for multi-modal rotary position embedding (zeros for this task).
749
+ """
750
+
+ batch_size, seq_len = input_ids.shape
+ device = input_ids.device
+ dtype = input_ids.dtype
+
+ image_start_token_id = self.config.image_start_token_id
+ image_end_token_id = self.config.image_end_token_id
+ num_complete_images = (input_ids == image_end_token_id).sum().item()
+
+ position_ids = torch.ones(
+ 3, input_ids.shape[0], input_ids.shape[1], dtype=input_ids.dtype, device=input_ids.device
+ )
+ text_positions = torch.arange(seq_len)[None, :].repeat(3, 1)
+ for batch_idx in range(batch_size):
+ curr_input_ids = input_ids[batch_idx]
+ if attention_mask is not None:
+ curr_input_ids = curr_input_ids[attention_mask[batch_idx] == 1]
+
+ image_end = torch.where(curr_input_ids == image_end_token_id)[0]
+ image_start = torch.where(curr_input_ids == image_start_token_id)[0] + 1
+ current_pos = 0 # track the current position value
+ prev_image_end = 0
+ curr_position_ids = []
+ for start, end, grid in zip(image_start, image_end, image_grid_thw):
+ _, num_width_grid, num_height_grid = grid
+
+ # Create text position ids first if there are text tokens before image
+ llm_pos_length = start - prev_image_end
+ llm_position_ids = text_positions[:, current_pos : current_pos + llm_pos_length].to(
+ device=input_ids.device
+ )
+ current_pos += llm_position_ids.shape[-1]
+
+ # Now create image position ids for each grid
+ image_seq_length = num_height_grid * num_width_grid
+ h_grids = image_seq_length // num_height_grid + current_pos
+ w_grids = image_seq_length // num_width_grid + current_pos
+ position_width = torch.arange(current_pos, w_grids, device=input_ids.device).repeat(num_width_grid)
+ position_height = torch.arange(current_pos, h_grids, device=input_ids.device).repeat_interleave(
+ num_height_grid
+ )
+ position_temporal = torch.full(
+ (image_seq_length,), current_pos, device=input_ids.device, dtype=torch.long
+ )
+ vision_position_ids = torch.stack([position_temporal, position_height, position_width], dim=0)
+ current_pos += max(num_height_grid, num_width_grid)
+
+ prev_image_end = end
+ curr_position_ids.append(torch.cat([llm_position_ids, vision_position_ids], dim=-1))
+
+ # Add position ids for the last text tokens if any
+ end_position = len(curr_input_ids) - prev_image_end
+ llm_position_ids = text_positions[:, current_pos : current_pos + end_position].to(device=input_ids.device)
+ current_pos += llm_position_ids.shape[-1]
+ curr_position_ids.append(llm_position_ids)
+ curr_position_ids = torch.cat(curr_position_ids, dim=-1)
+ if attention_mask is not None:
+ position_ids[:, batch_idx, attention_mask[batch_idx] == 1] = curr_position_ids.to(position_ids.device)
+ else:
+ position_ids[:, batch_idx, :] = curr_position_ids.to(position_ids.device)
+
+ # Build and store position ids for tokens that will be generated. Later we will just
+ # slice these instead of computing each decoding step
+ self._prefill_len = seq_len
+ if image_grid_thw is not None and len(image_grid_thw) > 0:
+ num_decode_grids = len(image_grid_thw) - num_complete_images
+ num_decode_grids = max(num_decode_grids, 0)
+ decode_pos = current_pos
+
+ decode_temporal_list = []
+ decode_height_list = []
+ decode_width_list = []
+
+ for i in range(1, num_decode_grids + 1):
+ grid_idx = -i
+ h = image_grid_thw[grid_idx, 1].item()
+ w = image_grid_thw[grid_idx, 2].item()
+ total_tokens = h * w
+
+ h_indices = torch.arange(h, device=device).unsqueeze(1).expand(h, w).flatten()
+ w_indices = torch.arange(w, device=device).unsqueeze(0).expand(h, w).flatten()
+
+ decode_temporal_list.append(torch.full((total_tokens,), decode_pos, device=device, dtype=torch.long))
+ decode_height_list.append(decode_pos + h_indices)
+ decode_width_list.append(decode_pos + w_indices)
+ decode_pos = decode_pos + max(h, w)
+
+ decode_temporal_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+ decode_height_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+ decode_width_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
+
+ self._cached_decode_position_ids = torch.stack(
+ [
+ torch.cat(decode_temporal_list, dim=0),
+ torch.cat(decode_height_list, dim=0),
+ torch.cat(decode_width_list, dim=0),
+ ],
+ dim=0,
+ )
+ else:
+ self._cached_decode_position_ids = None
+
+ mrope_position_deltas = torch.zeros([batch_size, 1], dtype=dtype, device=device)
+
+ return position_ids, mrope_position_deltas
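The cached decode ids can be reproduced standalone; for a target grid of `(1, 3, 2)` starting at position 5 this matches Case 2 of the docstring above:

```python
import torch

h, w, decode_pos = 3, 2, 5
h_indices = torch.arange(h).unsqueeze(1).expand(h, w).flatten()  # [0, 0, 1, 1, 2, 2]
w_indices = torch.arange(w).unsqueeze(0).expand(h, w).flatten()  # [0, 1, 0, 1, 0, 1]

print(torch.full((h * w,), decode_pos).tolist())  # temporal: [5, 5, 5, 5, 5, 5]
print((decode_pos + h_indices).tolist())  # height: [5, 5, 6, 6, 7, 7]
print((decode_pos + w_indices).tolist())  # width: [5, 6, 5, 6, 5, 6]
# The next grid (or the end marker) then starts at decode_pos + max(h, w) = 8.
```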
+
+ def get_image_tokens(
+ self,
+ hidden_states: torch.FloatTensor,
+ image_grid_thw: torch.LongTensor,
+ ) -> torch.LongTensor:
+ """
+ Tokenizes image features into discrete tokens with VQVAE module.
+
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(total_patches, hidden_size)`):
+ The packed image features from vision encoder.
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
+ The temporal, height and width of feature shape of each image.
+
+ Returns:
+ image_tokens (`torch.LongTensor` of shape `(total_patches,)`):
+ Discrete token indices from the VQVAE codebook.
+ """
+ hidden_size = hidden_states.shape[-1]
+ split_sizes = (image_grid_thw.prod(dim=-1)).tolist()
+ hidden_states_list = torch.split(hidden_states, split_sizes, dim=0)
+
+ all_image_toks = []
+ for i, hs in enumerate(hidden_states_list):
+ grid_t, grid_h, grid_w = image_grid_thw[i].tolist()
+ hs = hs.view(grid_t, grid_h, grid_w, hidden_size)
+ hs = hs.permute(0, 3, 1, 2).contiguous()
+ _, _, image_toks = self.vqmodel.encode(hs)
+ all_image_toks.append(image_toks)
+ return torch.cat(all_image_toks, dim=0)
+
+ def get_video_features(self):
+ raise AttributeError("Not needed for GlmImage")
+
+ def get_placeholder_mask(
+ self,
+ input_ids: torch.LongTensor,
+ image_ids: torch.LongTensor,
+ ):
+ """
+ Get the mask of image placeholder tokens in input_ids that will be replaced with actual image token ids from VQVAE.
+
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, seq_len)`):
+ Input token ids with image placeholders.
+ image_ids (`torch.LongTensor` of shape `(num_images, num_tokens_per_image)` or flattened):
+ Discrete token indices from the VQVAE codebook.
+
+ Returns:
+ special_image_mask (`torch.LongTensor` of shape `(batch_size, seq_len)`):
+ Mask indicating positions in input ids that will be replaced by actual image tokens.
+ """
+
+ special_image_mask = input_ids == self.config.image_token_id
+ n_placeholder_tokens = special_image_mask.sum().item()
+ n_image_tokens = image_ids.shape[0]
+
+ if n_placeholder_tokens != n_image_tokens:
+ raise ValueError(
+ f"Number of image placeholder tokens ({n_placeholder_tokens}) does not match "
+ f"number of image tokens from VQVAE ({n_image_tokens})"
+ )
+
+ return special_image_mask
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor | None = None,
+ attention_mask: torch.Tensor | None = None,
+ position_ids: torch.LongTensor | None = None,
+ past_key_values: Cache | None = None,
+ inputs_embeds: torch.FloatTensor | None = None,
+ pixel_values: torch.Tensor | None = None,
+ image_grid_thw: torch.LongTensor | None = None,
+ rope_deltas: torch.LongTensor | None = None,
+ cache_position: torch.LongTensor | None = None,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> tuple | GlmImageModelOutputWithPast:
+ r"""
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+ The temporal, height and width of feature shape of each image in LLM.
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+ The rope index difference between sequence length and multimodal rope.
+ """
+ if (input_ids is None) ^ (inputs_embeds is not None):
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+ if pixel_values is not None:
+ image_embeds = self.get_image_features(pixel_values, image_grid_thw[:-1])
+ image_embeds = torch.cat(image_embeds, dim=0)
+ image_ids = self.get_image_tokens(image_embeds, image_grid_thw[:-1])
+ image_ids = image_ids.view(-1).to(input_ids.device)
+ special_image_mask = self.get_placeholder_mask(input_ids, image_ids)
+ input_ids = input_ids.masked_scatter(special_image_mask, image_ids)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.get_input_embeddings()(input_ids)
+
+ if position_ids is None:
+ attention_mask_2d = attention_mask
+ if attention_mask is not None and attention_mask.ndim == 4:
+ attention_mask_2d = torch.diagonal(attention_mask[:, 0], dim1=1, dim2=2)
+ # Only apply conversion for floating point tensors (inverted masks)
+ if attention_mask_2d.dtype.is_floating_point:
+ attention_mask_2d = attention_mask_2d / torch.finfo(attention_mask_2d.dtype).min
+ attention_mask_2d = (1.0 - attention_mask_2d).int()
+
+ # Calculate RoPE index once per generation in the pre-fill stage only.
+ # It is safe to assume that `length!=1` means we're in pre-fill because the
+ # model is used only by DiT pipeline without assisted decoding, etc. techniques
+ is_prefill_stage = (input_ids is not None and input_ids.shape[1] != 1) or (
+ inputs_embeds is not None and inputs_embeds.shape[1] != 1
+ )
+ if is_prefill_stage or self.rope_deltas is None:
+ position_ids, rope_deltas = self.get_rope_index(
+ input_ids,
+ image_grid_thw,
+ attention_mask=attention_mask_2d,
+ )
+ self.rope_deltas = rope_deltas
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
+ else:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ # Use prefill token length, not position value
+ step = cache_position[0].item() - self._prefill_len
+ # Direct lookup - no tensor creation overhead
+ position_ids = self._cached_decode_position_ids[:, step : step + seq_length]
+ position_ids = position_ids.unsqueeze(1).expand(-1, batch_size, -1)
+
+ outputs = self.language_model(
+ input_ids=None,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ return GlmImageModelOutputWithPast(
+ last_hidden_state=outputs.last_hidden_state,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ rope_deltas=self.rope_deltas,
+ )
+
+
+
+ class GlmImageCausalLMOutputWithPast(Glm4vCausalLMOutputWithPast):
+ pass
+
+
+ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin):
+ _checkpoint_conversion_mapping = {}
+ _tied_weights_keys = {}
+ # Reference: fix gemma3 grad acc #37208
+ accepts_loss_kwargs = False
+ base_model_prefix = "model"
+ config: GlmImageConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = GlmImageModel(config)
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vision_vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
+ return self.model.get_image_features(pixel_values, image_grid_thw)
+
+ def get_image_tokens(self, hidden_states: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
+ return self.model.get_image_tokens(hidden_states, image_grid_thw)
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor | None = None,
+ attention_mask: torch.Tensor | None = None,
+ position_ids: torch.LongTensor | None = None,
+ past_key_values: Cache | None = None,
+ inputs_embeds: torch.FloatTensor | None = None,
+ labels: torch.LongTensor | None = None,
+ pixel_values: torch.Tensor | None = None,
+ image_grid_thw: torch.LongTensor | None = None,
+ cache_position: torch.LongTensor | None = None,
+ logits_to_keep: int | torch.Tensor = 0,
+ **kwargs: Unpack[TransformersKwargs],
+ ) -> tuple | GlmImageCausalLMOutputWithPast:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+ The temporal, height and width of feature shape of each image in LLM.
+
+ Example:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+ >>> from transformers import AutoProcessor, GlmImageForConditionalGeneration
+
+ >>> model = GlmImageForConditionalGeneration.from_pretrained("zai-org/GLM-Image")
+ >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-Image")
+
+ >>> messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "Add a truck to this photo.<sop>28 40<eop>"},
+ ],
+ },
+ ]
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ >>> inputs = processor(text=[text], images=[image])
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
+ ```"""
+ outputs = self.model(
+ input_ids=input_ids,
+ pixel_values=pixel_values,
+ image_grid_thw=image_grid_thw,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ hidden_states = outputs[0]
+
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vision_vocab_size)
+
+ return GlmImageCausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ rope_deltas=outputs.rope_deltas,
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ past_key_values=None,
+ attention_mask=None,
+ inputs_embeds=None,
+ cache_position=None,
+ position_ids=None,
+ use_cache=True,
+ pixel_values=None,
+ image_grid_thw=None,
+ is_first_iteration=False,
+ **kwargs,
+ ):
+ model_inputs = super().prepare_inputs_for_generation(
+ input_ids,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ cache_position=cache_position,
+ position_ids=position_ids,
+ pixel_values=pixel_values,
+ image_grid_thw=image_grid_thw,
+ is_first_iteration=is_first_iteration,
+ use_cache=use_cache,
+ **kwargs,
+ )
+
+ model_inputs["position_ids"] = None
+
+ if not is_first_iteration and use_cache:
+ model_inputs["pixel_values"] = None
+
+ return model_inputs
+
+ def _get_image_nums(
+ self,
+ input_ids: torch.LongTensor | None,
+ ) -> torch.Tensor:
+ """
+ Get the number of images for each sample.
+ For GLM-Image, only input_ids allow us to get the number of images.
+
+ Returns:
+ image_counts (`torch.LongTensor` of shape `(batch_size,)`)
+ """
+ is_image = input_ids == self.config.image_start_token_id
+
+ return is_image.sum(dim=1)
+
+ def _expand_inputs_for_generation(
+ self,
+ expand_size: int = 1,
+ is_encoder_decoder: bool = False,
+ input_ids: torch.LongTensor | None = None,
+ **model_kwargs,
+ ) -> tuple[torch.LongTensor, dict[str, Any]]:
+ # Overwritten -- Support for expanding tensors without a batch size dimension
+ # e.g., pixel_values, image_grid_thw
+ # pixel_values.shape[0] is sum(seqlen_images for samples)
+ # image_grid_thw.shape[0] is sum(num_images for samples)
+
+ if expand_size == 1:
+ return input_ids, model_kwargs
+
+ visual_keys = ["pixel_values", "image_grid_thw"]
+
+ def _expand_dict_for_generation_visual(dict_to_expand):
+ image_grid_thw = model_kwargs.get("image_grid_thw", None)
+ image_nums = self._get_image_nums(input_ids)
+
+ def _repeat_interleave_samples(x, lengths, repeat_times):
+ samples = torch.split(x, lengths)
+ repeat_args = [repeat_times] + [1] * (x.dim() - 1)
+ result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
+ return result
+
+ for key in dict_to_expand:
+ if key == "pixel_values":
+ # split images into samples
+ samples = torch.split(image_grid_thw[: sum(image_nums)], list(image_nums))
+ # compute the sequence length of images for each sample
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+ dict_to_expand[key] = _repeat_interleave_samples(
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+ )
+ elif key == "image_grid_thw":
+ # get the num of images for each sample and +1 for the image being generated
+ lengths = list(image_nums)
+ last_image = dict_to_expand[key][-1:]
+ dict_to_expand[key] = _repeat_interleave_samples(
+ dict_to_expand[key][: sum(image_nums)], lengths=lengths, repeat_times=expand_size
+ )
+ dict_to_expand[key] = torch.cat([dict_to_expand[key], last_image], dim=0)
+ return dict_to_expand
+
+ def _expand_dict_for_generation(dict_to_expand):
+ for key in dict_to_expand:
+ if (
+ key != "cache_position"
+ and dict_to_expand[key] is not None
+ and isinstance(dict_to_expand[key], torch.Tensor)
+ and key not in visual_keys
+ ):
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
+ return dict_to_expand
+
+ model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
+
+ if input_ids is not None:
+ input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+
+ model_kwargs = _expand_dict_for_generation(model_kwargs)
+
+ if is_encoder_decoder:
+ if model_kwargs.get("encoder_outputs") is None:
+ raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+ model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+
+ return input_ids, model_kwargs
+
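The sample-wise repeat above differs from a plain `repeat_interleave`: it keeps each sample's rows contiguous; a toy sketch:

```python
import torch

x = torch.tensor([[1], [2], [3]])  # 3 rows: sample A owns 2 rows, sample B owns 1
samples = torch.split(x, [2, 1])
expanded = torch.cat([s.repeat(2, 1) for s in samples], dim=0)
print(expanded.flatten().tolist())  # [1, 2, 1, 2, 3, 3]
# By contrast, x.repeat_interleave(2, dim=0) would give [1, 1, 2, 2, 3, 3].
```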
+
+ def smart_resize(
+ height: int,
+ width: int,
+ factor: int = 16,
+ min_pixels: int = 512 * 512,
+ max_pixels: int = 2048 * 2048,
+ ) -> tuple[int, int]:
+ if height < factor or width < factor:
+ raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+ elif max(height, width) / min(height, width) > 4:
+ raise ValueError(
+ f"absolute aspect ratio must be smaller than 4, got {max(height, width) / min(height, width)}"
+ )
+
+ shortest_edge = int(round(math.sqrt(min_pixels)))
+ longest_edge = int(round(math.sqrt(max_pixels)))
+ min_side = min(height, width)
+ max_side = max(height, width)
+
+ scale = 1.0
+
+ if min_side < shortest_edge:
+ scale = shortest_edge / min_side
+
+ if max_side * scale > longest_edge:
+ scale = longest_edge / max_side
+
+ height = height // 2
+ width = width // 2
+
+ h_bar = max(factor, int(round(height * scale / factor)) * factor)
+ w_bar = max(factor, int(round(width * scale / factor)) * factor)
+
+ if max(h_bar, w_bar) > longest_edge:
+ beta = max(h_bar, w_bar) / longest_edge
+ h_bar = max(factor, int(math.floor((h_bar / beta) / factor)) * factor)
+ w_bar = max(factor, int(math.floor((w_bar / beta) / factor)) * factor)
+
+ return h_bar, w_bar
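Note the built-in halving before rounding, so the returned shape is in half-resolution units; tracing the code as written with its defaults (a sketch, not an official reference value):

```python
# smart_resize(1024, 768): shortest_edge=512, longest_edge=2048, scale stays 1.0
# (min side 768 >= 512); then height//2=512, width//2=384, and rounding to
# multiples of factor=16 keeps both values, so the result is (512, 384).
print(max(16, int(round(512 * 1.0 / 16)) * 16))  # 512
print(max(16, int(round(384 * 1.0 / 16)) * 16))  # 384
```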
+
+
+ class GlmImageImageProcessor(Qwen2VLImageProcessor):
+ pass
+
+
+ class GlmImageImageProcessorFast(Qwen2VLImageProcessorFast):
+ pass
+
+
+ class GlmImageImagesKwargs(ImagesKwargs, total=False):
+ """
+ target_h (`int`):
+ Height of the target image to be generated.
+ target_w (`int`):
+ Width of the target image to be generated.
+ """
+
+ target_h: int
+ target_w: int
+
+
+ class GlmImageProcessorKwargs(Qwen2VLProcessorKwargs):
+ images_kwargs: GlmImageImagesKwargs
+
+ _defaults = {
+ "text_kwargs": {
+ "padding": False,
+ "return_mm_token_type_ids": False,
+ },
+ "images_kwargs": {
+ "target_h": 1152,
+ "target_w": 768,
+ },
+ }
+
+
+ class GlmImageProcessor(ProcessorMixin):
+ r"""
+ Constructs a GLM-Image processor which wraps a GLM-Image image processor and a GLM-Image tokenizer into a single processor.
+ See [`~GlmImageProcessor.__call__`] and [`~GlmImageProcessor.decode`] for more information.
+ Args:
+ image_processor ([`GlmImageImageProcessor`], *optional*):
+ The image processor is a required input.
+ tokenizer ([`PreTrainedTokenizerFast`], *optional*):
+ The tokenizer is a required input.
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+ in a chat into a tokenizable string.
+ """
+
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+ self.image_token = tokenizer.image_token
+ self.grid_bos_token = tokenizer.grid_bos_token
+ self.grid_eos_token = tokenizer.grid_eos_token
+ self.bos_token = tokenizer.bos_token
+ self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+ def __call__(
+ self,
+ images: ImageInput | None = None,
+ text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+ **kwargs: Unpack[GlmImageProcessorKwargs],
+ ) -> BatchFeature:
+ """
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
+ and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
+ the text.
+
+ Args:
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+ tensor. Both channels-first and channels-last formats are supported.
+ text (`str`, `List[str]`, `List[List[str]]`):
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
+ If set, will return tensors of a particular framework. Acceptable values are:
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+
+ Returns:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+ `None`).
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
+ """
+ output_kwargs = self._merge_kwargs(
+ GlmImageProcessorKwargs,
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+ **kwargs,
+ )
+ target_h = output_kwargs["images_kwargs"].pop("target_h", None)
+ target_w = output_kwargs["images_kwargs"].pop("target_w", None)
+ is_text_to_image = images is None
+
+ if images is not None:
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+ image_grid_thw = image_inputs["image_grid_thw"]
+ else:
+ image_inputs = {}
+ image_grid_thw = None
+
+ if not isinstance(text, list):
+ text = [text]
+
+ if len(text) > 1:
+ raise ValueError("The model does not support batch size > 1")
+
+ text = text.copy() # below lines change text in-place
+ if not is_text_to_image:
+ index = 0
+ for i in range(len(text)):
+ while self.image_token in text[i]:
+ grid = image_grid_thw[index]
+ num_image_tokens = int(grid[1] * grid[2])
+ text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+ index += 1
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+ text[0], token_h, token_w, prev_h, prev_w = self._build_prompt_with_target_shape(
+ text[0], height=target_h, width=target_w, is_text_to_image=is_text_to_image
+ )
+ image_inputs["image_grid_thw"] = self._build_target_image_grid_thw(
+ token_h=token_h,
+ token_w=token_w,
+ prev_token_h=prev_h,
+ prev_token_w=prev_w,
+ image_grid_thw=image_grid_thw if not is_text_to_image else None,
+ )
+
+ return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+ return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+ self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+ if return_mm_token_type_ids:
+ array_ids = np.array(text_inputs["input_ids"])
+ mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+ mm_token_type_ids[array_ids == self.image_token_id] = 1
+ text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+ return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+ def _build_prompt_with_target_shape(
+ self,
+ prompt: str,
+ height: int,
+ width: int,
+ is_text_to_image: bool,
+ ) -> tuple[str, int, int, int, int]:
+ factor = 32
+ height = (height // factor) * factor
+ width = (width // factor) * factor
+ token_h = height // factor
+ token_w = width // factor
+ ratio = token_h / token_w
+ prev_token_h = int(math.sqrt(ratio) * (factor // 2))
+ prev_token_w = int(math.sqrt(1 / ratio) * (factor // 2))
+
+ if is_text_to_image:
+ expanded_prompt = f"{prompt}{self.grid_bos_token}{token_h} {token_w}{self.grid_eos_token}{self.grid_bos_token}{prev_token_h} {prev_token_w}{self.grid_eos_token}{self.bos_token}"
+ else:
+ expanded_prompt = f"{prompt}{self.grid_bos_token}{token_h} {token_w}{self.grid_eos_token}{self.bos_token}"
+
+ return expanded_prompt, token_h, token_w, prev_token_h, prev_token_w
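For the default 1152x768 target this produces a 36x24 token grid plus a smaller draft grid; tracing the arithmetic of the method above (values computed from the code as written, using the `<sop>`/`<eop>` grid tokens seen in the examples):

```python
import math

factor = 32
height, width = 1152, 768
token_h, token_w = height // factor, width // factor  # 36, 24
ratio = token_h / token_w  # 1.5
prev_token_h = int(math.sqrt(ratio) * (factor // 2))  # int(19.59...) == 19
prev_token_w = int(math.sqrt(1 / ratio) * (factor // 2))  # int(13.06...) == 13
print(token_h, token_w, prev_token_h, prev_token_w)  # 36 24 19 13
# Text-to-image prompts are then suffixed with "<sop>36 24<eop><sop>19 13<eop>" plus the BOS token.
```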
+
+ @staticmethod
+ def _build_target_image_grid_thw(
+ token_h: int,
+ token_w: int,
+ prev_token_h: int,
+ prev_token_w: int,
+ image_grid_thw: torch.Tensor | None,
+ ):
+ if image_grid_thw is None:
+ return torch.tensor(
+ [
+ [1, token_h, token_w],
+ [1, prev_token_h, prev_token_w],
+ ],
+ )
+ else:
+ return torch.cat(
+ [image_grid_thw, torch.tensor([[1, token_h, token_w]], device=image_grid_thw.device)], dim=0
+ )
+
+
+ __all__ = [
+ "GlmImageVQVAEConfig",
+ "GlmImageVisionConfig",
+ "GlmImageTextConfig",
+ "GlmImageConfig",
+ "GlmImagePreTrainedModel",
+ "GlmImageVQVAE",
+ "GlmImageVisionModel",
+ "GlmImageTextModel",
+ "GlmImageModel",
+ "GlmImageForConditionalGeneration",
+ "GlmImageImageProcessor",
+ "GlmImageImageProcessorFast",
+ "GlmImageProcessor",
+ ]