transformers-5.0.0rc2-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in those registries.
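Wheels are ordinary zip archives, so a per-file listing like the one below can be approximated locally with the Python standard library. The following is a minimal sketch, not the diff service's actual tooling: the local wheel filenames are assumptions, and it only compares `.py` members, so its counts may differ slightly from the table's.

```python
# Sketch: reproduce a "files changed: +x -y" listing from two wheels.
# Assumes both wheels have been downloaded to the working directory.
import zipfile
import difflib

OLD = "transformers-5.0.0rc2-py3-none-any.whl"  # assumed local path
NEW = "transformers-5.1.0-py3-none-any.whl"     # assumed local path

def read_wheel(path):
    """Map each .py member of the wheel to its decoded text lines."""
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }

old_files, new_files = read_wheel(OLD), read_wheel(NEW)

for name in sorted(old_files.keys() | new_files.keys()):
    a = old_files.get(name, [])  # missing file -> every line counts as added/removed
    b = new_files.get(name, [])
    if a == b:
        continue
    added = removed = 0
    for line in difflib.unified_diff(a, b, lineterm=""):
        # Skip the "---"/"+++" file headers; count only real +/- lines.
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    print(f"{name} +{added} -{removed}")
```

A rename such as entry 821 (`transformers/{pipelines/deprecated → models/lw_detr}/__init__.py`) would appear here as one removed path plus one added path; rename detection is left out of this sketch.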
Files changed (1594)
  1. transformers/__init__.py +11 -37
  2. transformers/activations.py +2 -2
  3. transformers/audio_utils.py +32 -32
  4. transformers/backbone_utils.py +326 -0
  5. transformers/cache_utils.py +26 -126
  6. transformers/cli/chat.py +3 -3
  7. transformers/cli/serve.py +13 -10
  8. transformers/cli/transformers.py +2 -1
  9. transformers/configuration_utils.py +22 -92
  10. transformers/conversion_mapping.py +150 -26
  11. transformers/convert_slow_tokenizer.py +9 -12
  12. transformers/core_model_loading.py +217 -129
  13. transformers/data/processors/glue.py +0 -1
  14. transformers/data/processors/utils.py +0 -1
  15. transformers/data/processors/xnli.py +0 -1
  16. transformers/dependency_versions_check.py +0 -1
  17. transformers/dependency_versions_table.py +10 -11
  18. transformers/distributed/configuration_utils.py +1 -2
  19. transformers/dynamic_module_utils.py +23 -23
  20. transformers/feature_extraction_sequence_utils.py +19 -23
  21. transformers/feature_extraction_utils.py +14 -14
  22. transformers/file_utils.py +0 -2
  23. transformers/generation/candidate_generator.py +2 -4
  24. transformers/generation/configuration_utils.py +54 -39
  25. transformers/generation/continuous_batching/__init__.py +0 -1
  26. transformers/generation/continuous_batching/cache.py +74 -44
  27. transformers/generation/continuous_batching/cache_manager.py +28 -28
  28. transformers/generation/continuous_batching/continuous_api.py +133 -414
  29. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  30. transformers/generation/continuous_batching/requests.py +77 -19
  31. transformers/generation/continuous_batching/scheduler.py +154 -104
  32. transformers/generation/logits_process.py +10 -133
  33. transformers/generation/stopping_criteria.py +1 -2
  34. transformers/generation/streamers.py +0 -1
  35. transformers/generation/utils.py +91 -121
  36. transformers/generation/watermarking.py +2 -3
  37. transformers/hf_argparser.py +9 -13
  38. transformers/hyperparameter_search.py +1 -2
  39. transformers/image_processing_base.py +9 -9
  40. transformers/image_processing_utils.py +11 -15
  41. transformers/image_processing_utils_fast.py +70 -71
  42. transformers/image_transforms.py +73 -42
  43. transformers/image_utils.py +30 -37
  44. transformers/initialization.py +57 -0
  45. transformers/integrations/__init__.py +10 -24
  46. transformers/integrations/accelerate.py +47 -11
  47. transformers/integrations/awq.py +1 -3
  48. transformers/integrations/deepspeed.py +146 -4
  49. transformers/integrations/eetq.py +0 -1
  50. transformers/integrations/executorch.py +2 -6
  51. transformers/integrations/fbgemm_fp8.py +1 -2
  52. transformers/integrations/finegrained_fp8.py +149 -13
  53. transformers/integrations/flash_attention.py +3 -8
  54. transformers/integrations/flex_attention.py +1 -1
  55. transformers/integrations/fp_quant.py +4 -6
  56. transformers/integrations/ggml.py +0 -1
  57. transformers/integrations/hub_kernels.py +18 -7
  58. transformers/integrations/integration_utils.py +2 -3
  59. transformers/integrations/moe.py +226 -106
  60. transformers/integrations/mxfp4.py +52 -40
  61. transformers/integrations/peft.py +488 -176
  62. transformers/integrations/quark.py +2 -4
  63. transformers/integrations/tensor_parallel.py +641 -581
  64. transformers/integrations/torchao.py +4 -6
  65. transformers/loss/loss_lw_detr.py +356 -0
  66. transformers/loss/loss_utils.py +2 -0
  67. transformers/masking_utils.py +199 -59
  68. transformers/model_debugging_utils.py +4 -5
  69. transformers/modelcard.py +14 -192
  70. transformers/modeling_attn_mask_utils.py +19 -19
  71. transformers/modeling_flash_attention_utils.py +28 -29
  72. transformers/modeling_gguf_pytorch_utils.py +5 -5
  73. transformers/modeling_layers.py +21 -22
  74. transformers/modeling_outputs.py +242 -253
  75. transformers/modeling_rope_utils.py +32 -32
  76. transformers/modeling_utils.py +416 -438
  77. transformers/models/__init__.py +10 -0
  78. transformers/models/afmoe/configuration_afmoe.py +40 -33
  79. transformers/models/afmoe/modeling_afmoe.py +38 -41
  80. transformers/models/afmoe/modular_afmoe.py +23 -25
  81. transformers/models/aimv2/configuration_aimv2.py +2 -10
  82. transformers/models/aimv2/modeling_aimv2.py +46 -45
  83. transformers/models/aimv2/modular_aimv2.py +13 -19
  84. transformers/models/albert/configuration_albert.py +8 -2
  85. transformers/models/albert/modeling_albert.py +70 -72
  86. transformers/models/albert/tokenization_albert.py +1 -4
  87. transformers/models/align/configuration_align.py +8 -6
  88. transformers/models/align/modeling_align.py +83 -86
  89. transformers/models/align/processing_align.py +2 -30
  90. transformers/models/altclip/configuration_altclip.py +4 -7
  91. transformers/models/altclip/modeling_altclip.py +106 -103
  92. transformers/models/altclip/processing_altclip.py +2 -15
  93. transformers/models/apertus/__init__.py +0 -1
  94. transformers/models/apertus/configuration_apertus.py +23 -28
  95. transformers/models/apertus/modeling_apertus.py +35 -38
  96. transformers/models/apertus/modular_apertus.py +36 -40
  97. transformers/models/arcee/configuration_arcee.py +25 -30
  98. transformers/models/arcee/modeling_arcee.py +35 -38
  99. transformers/models/arcee/modular_arcee.py +20 -23
  100. transformers/models/aria/configuration_aria.py +31 -44
  101. transformers/models/aria/image_processing_aria.py +25 -27
  102. transformers/models/aria/modeling_aria.py +102 -102
  103. transformers/models/aria/modular_aria.py +111 -124
  104. transformers/models/aria/processing_aria.py +28 -35
  105. transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +0 -1
  106. transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +3 -6
  107. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +9 -11
  108. transformers/models/audioflamingo3/__init__.py +0 -1
  109. transformers/models/audioflamingo3/configuration_audioflamingo3.py +0 -1
  110. transformers/models/audioflamingo3/modeling_audioflamingo3.py +60 -52
  111. transformers/models/audioflamingo3/modular_audioflamingo3.py +52 -43
  112. transformers/models/audioflamingo3/processing_audioflamingo3.py +6 -8
  113. transformers/models/auto/auto_factory.py +12 -11
  114. transformers/models/auto/configuration_auto.py +48 -5
  115. transformers/models/auto/feature_extraction_auto.py +5 -7
  116. transformers/models/auto/image_processing_auto.py +30 -39
  117. transformers/models/auto/modeling_auto.py +33 -199
  118. transformers/models/auto/processing_auto.py +11 -19
  119. transformers/models/auto/tokenization_auto.py +38 -37
  120. transformers/models/auto/video_processing_auto.py +7 -8
  121. transformers/models/autoformer/configuration_autoformer.py +4 -7
  122. transformers/models/autoformer/modeling_autoformer.py +100 -101
  123. transformers/models/aya_vision/configuration_aya_vision.py +4 -1
  124. transformers/models/aya_vision/modeling_aya_vision.py +64 -99
  125. transformers/models/aya_vision/modular_aya_vision.py +46 -74
  126. transformers/models/aya_vision/processing_aya_vision.py +25 -53
  127. transformers/models/bamba/configuration_bamba.py +46 -39
  128. transformers/models/bamba/modeling_bamba.py +83 -119
  129. transformers/models/bamba/modular_bamba.py +70 -109
  130. transformers/models/bark/configuration_bark.py +6 -8
  131. transformers/models/bark/generation_configuration_bark.py +3 -5
  132. transformers/models/bark/modeling_bark.py +64 -65
  133. transformers/models/bark/processing_bark.py +19 -41
  134. transformers/models/bart/configuration_bart.py +9 -5
  135. transformers/models/bart/modeling_bart.py +124 -129
  136. transformers/models/barthez/tokenization_barthez.py +1 -4
  137. transformers/models/bartpho/tokenization_bartpho.py +6 -7
  138. transformers/models/beit/configuration_beit.py +2 -15
  139. transformers/models/beit/image_processing_beit.py +53 -56
  140. transformers/models/beit/image_processing_beit_fast.py +11 -12
  141. transformers/models/beit/modeling_beit.py +65 -62
  142. transformers/models/bert/configuration_bert.py +12 -2
  143. transformers/models/bert/modeling_bert.py +117 -152
  144. transformers/models/bert/tokenization_bert.py +2 -4
  145. transformers/models/bert/tokenization_bert_legacy.py +3 -5
  146. transformers/models/bert_generation/configuration_bert_generation.py +17 -2
  147. transformers/models/bert_generation/modeling_bert_generation.py +53 -55
  148. transformers/models/bert_generation/tokenization_bert_generation.py +2 -3
  149. transformers/models/bert_japanese/tokenization_bert_japanese.py +5 -6
  150. transformers/models/bertweet/tokenization_bertweet.py +1 -3
  151. transformers/models/big_bird/configuration_big_bird.py +12 -9
  152. transformers/models/big_bird/modeling_big_bird.py +107 -124
  153. transformers/models/big_bird/tokenization_big_bird.py +1 -4
  154. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -9
  155. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +118 -118
  156. transformers/models/biogpt/configuration_biogpt.py +8 -2
  157. transformers/models/biogpt/modeling_biogpt.py +73 -79
  158. transformers/models/biogpt/modular_biogpt.py +60 -66
  159. transformers/models/biogpt/tokenization_biogpt.py +3 -5
  160. transformers/models/bit/configuration_bit.py +2 -5
  161. transformers/models/bit/image_processing_bit.py +21 -24
  162. transformers/models/bit/image_processing_bit_fast.py +0 -1
  163. transformers/models/bit/modeling_bit.py +15 -16
  164. transformers/models/bitnet/configuration_bitnet.py +23 -28
  165. transformers/models/bitnet/modeling_bitnet.py +34 -38
  166. transformers/models/bitnet/modular_bitnet.py +7 -10
  167. transformers/models/blenderbot/configuration_blenderbot.py +8 -5
  168. transformers/models/blenderbot/modeling_blenderbot.py +68 -99
  169. transformers/models/blenderbot/tokenization_blenderbot.py +0 -1
  170. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -5
  171. transformers/models/blenderbot_small/modeling_blenderbot_small.py +70 -72
  172. transformers/models/blenderbot_small/tokenization_blenderbot_small.py +1 -3
  173. transformers/models/blip/configuration_blip.py +9 -10
  174. transformers/models/blip/image_processing_blip.py +17 -20
  175. transformers/models/blip/image_processing_blip_fast.py +0 -1
  176. transformers/models/blip/modeling_blip.py +115 -108
  177. transformers/models/blip/modeling_blip_text.py +63 -65
  178. transformers/models/blip/processing_blip.py +5 -36
  179. transformers/models/blip_2/configuration_blip_2.py +2 -2
  180. transformers/models/blip_2/modeling_blip_2.py +145 -121
  181. transformers/models/blip_2/processing_blip_2.py +8 -38
  182. transformers/models/bloom/configuration_bloom.py +5 -2
  183. transformers/models/bloom/modeling_bloom.py +60 -60
  184. transformers/models/blt/configuration_blt.py +94 -86
  185. transformers/models/blt/modeling_blt.py +93 -90
  186. transformers/models/blt/modular_blt.py +127 -69
  187. transformers/models/bridgetower/configuration_bridgetower.py +7 -2
  188. transformers/models/bridgetower/image_processing_bridgetower.py +34 -35
  189. transformers/models/bridgetower/image_processing_bridgetower_fast.py +13 -14
  190. transformers/models/bridgetower/modeling_bridgetower.py +136 -124
  191. transformers/models/bridgetower/processing_bridgetower.py +2 -16
  192. transformers/models/bros/configuration_bros.py +24 -18
  193. transformers/models/bros/modeling_bros.py +78 -80
  194. transformers/models/bros/processing_bros.py +2 -12
  195. transformers/models/byt5/tokenization_byt5.py +4 -6
  196. transformers/models/camembert/configuration_camembert.py +8 -2
  197. transformers/models/camembert/modeling_camembert.py +97 -99
  198. transformers/models/camembert/modular_camembert.py +51 -54
  199. transformers/models/camembert/tokenization_camembert.py +1 -4
  200. transformers/models/canine/configuration_canine.py +4 -2
  201. transformers/models/canine/modeling_canine.py +73 -75
  202. transformers/models/canine/tokenization_canine.py +0 -1
  203. transformers/models/chameleon/configuration_chameleon.py +29 -34
  204. transformers/models/chameleon/image_processing_chameleon.py +21 -24
  205. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -6
  206. transformers/models/chameleon/modeling_chameleon.py +135 -92
  207. transformers/models/chameleon/processing_chameleon.py +16 -41
  208. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -8
  209. transformers/models/chinese_clip/image_processing_chinese_clip.py +21 -24
  210. transformers/models/chinese_clip/image_processing_chinese_clip_fast.py +0 -1
  211. transformers/models/chinese_clip/modeling_chinese_clip.py +93 -95
  212. transformers/models/chinese_clip/processing_chinese_clip.py +2 -15
  213. transformers/models/clap/configuration_clap.py +4 -9
  214. transformers/models/clap/feature_extraction_clap.py +9 -10
  215. transformers/models/clap/modeling_clap.py +109 -111
  216. transformers/models/clap/processing_clap.py +2 -15
  217. transformers/models/clip/configuration_clip.py +4 -2
  218. transformers/models/clip/image_processing_clip.py +21 -24
  219. transformers/models/clip/image_processing_clip_fast.py +9 -1
  220. transformers/models/clip/modeling_clip.py +70 -68
  221. transformers/models/clip/processing_clip.py +2 -14
  222. transformers/models/clip/tokenization_clip.py +2 -5
  223. transformers/models/clipseg/configuration_clipseg.py +4 -2
  224. transformers/models/clipseg/modeling_clipseg.py +113 -112
  225. transformers/models/clipseg/processing_clipseg.py +19 -42
  226. transformers/models/clvp/configuration_clvp.py +15 -5
  227. transformers/models/clvp/feature_extraction_clvp.py +7 -10
  228. transformers/models/clvp/modeling_clvp.py +138 -145
  229. transformers/models/clvp/number_normalizer.py +1 -2
  230. transformers/models/clvp/processing_clvp.py +3 -20
  231. transformers/models/clvp/tokenization_clvp.py +0 -1
  232. transformers/models/code_llama/tokenization_code_llama.py +3 -6
  233. transformers/models/codegen/configuration_codegen.py +4 -4
  234. transformers/models/codegen/modeling_codegen.py +50 -49
  235. transformers/models/codegen/tokenization_codegen.py +5 -6
  236. transformers/models/cohere/configuration_cohere.py +25 -30
  237. transformers/models/cohere/modeling_cohere.py +39 -42
  238. transformers/models/cohere/modular_cohere.py +27 -31
  239. transformers/models/cohere/tokenization_cohere.py +5 -6
  240. transformers/models/cohere2/configuration_cohere2.py +27 -32
  241. transformers/models/cohere2/modeling_cohere2.py +38 -41
  242. transformers/models/cohere2/modular_cohere2.py +48 -52
  243. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  244. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +9 -10
  245. transformers/models/cohere2_vision/modeling_cohere2_vision.py +52 -55
  246. transformers/models/cohere2_vision/modular_cohere2_vision.py +41 -43
  247. transformers/models/cohere2_vision/processing_cohere2_vision.py +6 -36
  248. transformers/models/colpali/configuration_colpali.py +0 -1
  249. transformers/models/colpali/modeling_colpali.py +14 -16
  250. transformers/models/colpali/modular_colpali.py +11 -51
  251. transformers/models/colpali/processing_colpali.py +14 -52
  252. transformers/models/colqwen2/modeling_colqwen2.py +27 -28
  253. transformers/models/colqwen2/modular_colqwen2.py +36 -74
  254. transformers/models/colqwen2/processing_colqwen2.py +16 -52
  255. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -47
  256. transformers/models/conditional_detr/image_processing_conditional_detr.py +67 -70
  257. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +50 -36
  258. transformers/models/conditional_detr/modeling_conditional_detr.py +851 -1001
  259. transformers/models/conditional_detr/modular_conditional_detr.py +901 -5
  260. transformers/models/convbert/configuration_convbert.py +11 -8
  261. transformers/models/convbert/modeling_convbert.py +85 -87
  262. transformers/models/convbert/tokenization_convbert.py +0 -1
  263. transformers/models/convnext/configuration_convnext.py +2 -5
  264. transformers/models/convnext/image_processing_convnext.py +18 -21
  265. transformers/models/convnext/image_processing_convnext_fast.py +7 -8
  266. transformers/models/convnext/modeling_convnext.py +12 -14
  267. transformers/models/convnextv2/configuration_convnextv2.py +2 -5
  268. transformers/models/convnextv2/modeling_convnextv2.py +12 -14
  269. transformers/models/cpm/tokenization_cpm.py +6 -7
  270. transformers/models/cpm/tokenization_cpm_fast.py +3 -5
  271. transformers/models/cpmant/configuration_cpmant.py +4 -1
  272. transformers/models/cpmant/modeling_cpmant.py +38 -40
  273. transformers/models/cpmant/tokenization_cpmant.py +1 -3
  274. transformers/models/csm/configuration_csm.py +58 -66
  275. transformers/models/csm/generation_csm.py +13 -14
  276. transformers/models/csm/modeling_csm.py +81 -84
  277. transformers/models/csm/modular_csm.py +56 -58
  278. transformers/models/csm/processing_csm.py +25 -68
  279. transformers/models/ctrl/configuration_ctrl.py +16 -1
  280. transformers/models/ctrl/modeling_ctrl.py +51 -66
  281. transformers/models/ctrl/tokenization_ctrl.py +0 -1
  282. transformers/models/cvt/configuration_cvt.py +0 -1
  283. transformers/models/cvt/modeling_cvt.py +13 -15
  284. transformers/models/cwm/__init__.py +0 -1
  285. transformers/models/cwm/configuration_cwm.py +8 -12
  286. transformers/models/cwm/modeling_cwm.py +36 -38
  287. transformers/models/cwm/modular_cwm.py +10 -12
  288. transformers/models/d_fine/configuration_d_fine.py +10 -57
  289. transformers/models/d_fine/modeling_d_fine.py +786 -927
  290. transformers/models/d_fine/modular_d_fine.py +339 -417
  291. transformers/models/dab_detr/configuration_dab_detr.py +22 -49
  292. transformers/models/dab_detr/modeling_dab_detr.py +79 -77
  293. transformers/models/dac/configuration_dac.py +0 -1
  294. transformers/models/dac/feature_extraction_dac.py +6 -9
  295. transformers/models/dac/modeling_dac.py +22 -24
  296. transformers/models/data2vec/configuration_data2vec_audio.py +4 -2
  297. transformers/models/data2vec/configuration_data2vec_text.py +11 -3
  298. transformers/models/data2vec/configuration_data2vec_vision.py +0 -1
  299. transformers/models/data2vec/modeling_data2vec_audio.py +55 -59
  300. transformers/models/data2vec/modeling_data2vec_text.py +97 -99
  301. transformers/models/data2vec/modeling_data2vec_vision.py +45 -44
  302. transformers/models/data2vec/modular_data2vec_audio.py +6 -1
  303. transformers/models/data2vec/modular_data2vec_text.py +51 -54
  304. transformers/models/dbrx/configuration_dbrx.py +29 -22
  305. transformers/models/dbrx/modeling_dbrx.py +45 -48
  306. transformers/models/dbrx/modular_dbrx.py +37 -39
  307. transformers/models/deberta/configuration_deberta.py +6 -1
  308. transformers/models/deberta/modeling_deberta.py +57 -60
  309. transformers/models/deberta/tokenization_deberta.py +2 -5
  310. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -1
  311. transformers/models/deberta_v2/modeling_deberta_v2.py +63 -65
  312. transformers/models/deberta_v2/tokenization_deberta_v2.py +1 -4
  313. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -2
  314. transformers/models/decision_transformer/modeling_decision_transformer.py +51 -53
  315. transformers/models/deepseek_v2/configuration_deepseek_v2.py +41 -47
  316. transformers/models/deepseek_v2/modeling_deepseek_v2.py +39 -41
  317. transformers/models/deepseek_v2/modular_deepseek_v2.py +48 -52
  318. transformers/models/deepseek_v3/configuration_deepseek_v3.py +42 -48
  319. transformers/models/deepseek_v3/modeling_deepseek_v3.py +38 -40
  320. transformers/models/deepseek_v3/modular_deepseek_v3.py +10 -10
  321. transformers/models/deepseek_vl/configuration_deepseek_vl.py +6 -3
  322. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +27 -28
  323. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +12 -11
  324. transformers/models/deepseek_vl/modeling_deepseek_vl.py +48 -43
  325. transformers/models/deepseek_vl/modular_deepseek_vl.py +15 -43
  326. transformers/models/deepseek_vl/processing_deepseek_vl.py +10 -41
  327. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +7 -5
  328. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +37 -37
  329. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +22 -22
  330. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +100 -56
  331. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +141 -109
  332. transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +12 -44
  333. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -46
  334. transformers/models/deformable_detr/image_processing_deformable_detr.py +59 -61
  335. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +42 -28
  336. transformers/models/deformable_detr/modeling_deformable_detr.py +454 -652
  337. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -5
  338. transformers/models/deit/configuration_deit.py +0 -1
  339. transformers/models/deit/image_processing_deit.py +18 -21
  340. transformers/models/deit/image_processing_deit_fast.py +0 -1
  341. transformers/models/deit/modeling_deit.py +27 -25
  342. transformers/models/depth_anything/configuration_depth_anything.py +12 -43
  343. transformers/models/depth_anything/modeling_depth_anything.py +10 -11
  344. transformers/models/depth_pro/configuration_depth_pro.py +0 -1
  345. transformers/models/depth_pro/image_processing_depth_pro.py +22 -23
  346. transformers/models/depth_pro/image_processing_depth_pro_fast.py +8 -9
  347. transformers/models/depth_pro/modeling_depth_pro.py +29 -27
  348. transformers/models/detr/configuration_detr.py +18 -50
  349. transformers/models/detr/image_processing_detr.py +64 -66
  350. transformers/models/detr/image_processing_detr_fast.py +33 -34
  351. transformers/models/detr/modeling_detr.py +748 -789
  352. transformers/models/dia/configuration_dia.py +9 -15
  353. transformers/models/dia/feature_extraction_dia.py +6 -9
  354. transformers/models/dia/generation_dia.py +48 -53
  355. transformers/models/dia/modeling_dia.py +68 -71
  356. transformers/models/dia/modular_dia.py +56 -58
  357. transformers/models/dia/processing_dia.py +39 -29
  358. transformers/models/dia/tokenization_dia.py +3 -6
  359. transformers/models/diffllama/configuration_diffllama.py +25 -30
  360. transformers/models/diffllama/modeling_diffllama.py +45 -53
  361. transformers/models/diffllama/modular_diffllama.py +18 -25
  362. transformers/models/dinat/configuration_dinat.py +2 -5
  363. transformers/models/dinat/modeling_dinat.py +47 -48
  364. transformers/models/dinov2/configuration_dinov2.py +2 -5
  365. transformers/models/dinov2/modeling_dinov2.py +20 -21
  366. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +3 -5
  367. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +21 -21
  368. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +11 -14
  369. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +6 -11
  370. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +5 -9
  371. transformers/models/dinov3_vit/configuration_dinov3_vit.py +7 -12
  372. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +7 -8
  373. transformers/models/dinov3_vit/modeling_dinov3_vit.py +19 -22
  374. transformers/models/dinov3_vit/modular_dinov3_vit.py +16 -19
  375. transformers/models/distilbert/configuration_distilbert.py +8 -2
  376. transformers/models/distilbert/modeling_distilbert.py +47 -49
  377. transformers/models/distilbert/tokenization_distilbert.py +0 -1
  378. transformers/models/doge/__init__.py +0 -1
  379. transformers/models/doge/configuration_doge.py +42 -35
  380. transformers/models/doge/modeling_doge.py +46 -49
  381. transformers/models/doge/modular_doge.py +77 -68
  382. transformers/models/donut/configuration_donut_swin.py +0 -1
  383. transformers/models/donut/image_processing_donut.py +26 -29
  384. transformers/models/donut/image_processing_donut_fast.py +9 -14
  385. transformers/models/donut/modeling_donut_swin.py +44 -46
  386. transformers/models/donut/processing_donut.py +5 -26
  387. transformers/models/dots1/configuration_dots1.py +43 -36
  388. transformers/models/dots1/modeling_dots1.py +35 -38
  389. transformers/models/dots1/modular_dots1.py +0 -1
  390. transformers/models/dpr/configuration_dpr.py +19 -2
  391. transformers/models/dpr/modeling_dpr.py +37 -39
  392. transformers/models/dpr/tokenization_dpr.py +7 -9
  393. transformers/models/dpr/tokenization_dpr_fast.py +7 -9
  394. transformers/models/dpt/configuration_dpt.py +23 -66
  395. transformers/models/dpt/image_processing_dpt.py +65 -66
  396. transformers/models/dpt/image_processing_dpt_fast.py +18 -19
  397. transformers/models/dpt/modeling_dpt.py +38 -36
  398. transformers/models/dpt/modular_dpt.py +14 -15
  399. transformers/models/edgetam/configuration_edgetam.py +1 -2
  400. transformers/models/edgetam/modeling_edgetam.py +87 -89
  401. transformers/models/edgetam/modular_edgetam.py +7 -13
  402. transformers/models/edgetam_video/__init__.py +0 -1
  403. transformers/models/edgetam_video/configuration_edgetam_video.py +0 -1
  404. transformers/models/edgetam_video/modeling_edgetam_video.py +126 -128
  405. transformers/models/edgetam_video/modular_edgetam_video.py +25 -27
  406. transformers/models/efficientloftr/configuration_efficientloftr.py +4 -5
  407. transformers/models/efficientloftr/image_processing_efficientloftr.py +14 -16
  408. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +8 -7
  409. transformers/models/efficientloftr/modeling_efficientloftr.py +46 -38
  410. transformers/models/efficientloftr/modular_efficientloftr.py +1 -3
  411. transformers/models/efficientnet/configuration_efficientnet.py +0 -1
  412. transformers/models/efficientnet/image_processing_efficientnet.py +23 -26
  413. transformers/models/efficientnet/image_processing_efficientnet_fast.py +16 -17
  414. transformers/models/efficientnet/modeling_efficientnet.py +12 -14
  415. transformers/models/electra/configuration_electra.py +13 -3
  416. transformers/models/electra/modeling_electra.py +107 -109
  417. transformers/models/emu3/configuration_emu3.py +17 -17
  418. transformers/models/emu3/image_processing_emu3.py +44 -39
  419. transformers/models/emu3/modeling_emu3.py +143 -109
  420. transformers/models/emu3/modular_emu3.py +109 -73
  421. transformers/models/emu3/processing_emu3.py +18 -43
  422. transformers/models/encodec/configuration_encodec.py +2 -4
  423. transformers/models/encodec/feature_extraction_encodec.py +10 -13
  424. transformers/models/encodec/modeling_encodec.py +25 -29
  425. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -2
  426. transformers/models/encoder_decoder/modeling_encoder_decoder.py +37 -43
  427. transformers/models/eomt/configuration_eomt.py +12 -14
  428. transformers/models/eomt/image_processing_eomt.py +53 -55
  429. transformers/models/eomt/image_processing_eomt_fast.py +18 -19
  430. transformers/models/eomt/modeling_eomt.py +19 -21
  431. transformers/models/eomt/modular_eomt.py +28 -30
  432. transformers/models/eomt_dinov3/__init__.py +28 -0
  433. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  434. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  435. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  436. transformers/models/ernie/configuration_ernie.py +24 -3
  437. transformers/models/ernie/modeling_ernie.py +127 -162
  438. transformers/models/ernie/modular_ernie.py +91 -103
  439. transformers/models/ernie4_5/configuration_ernie4_5.py +23 -27
  440. transformers/models/ernie4_5/modeling_ernie4_5.py +35 -37
  441. transformers/models/ernie4_5/modular_ernie4_5.py +1 -3
  442. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +34 -39
  443. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +40 -42
  444. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +7 -9
  445. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -7
  446. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +34 -35
  447. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +6 -7
  448. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +305 -267
  449. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +163 -142
  450. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +3 -5
  451. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +17 -18
  452. transformers/models/esm/configuration_esm.py +11 -15
  453. transformers/models/esm/modeling_esm.py +35 -37
  454. transformers/models/esm/modeling_esmfold.py +43 -50
  455. transformers/models/esm/openfold_utils/chunk_utils.py +6 -6
  456. transformers/models/esm/openfold_utils/loss.py +1 -2
  457. transformers/models/esm/openfold_utils/protein.py +15 -16
  458. transformers/models/esm/openfold_utils/tensor_utils.py +6 -6
  459. transformers/models/esm/tokenization_esm.py +2 -4
  460. transformers/models/evolla/configuration_evolla.py +50 -40
  461. transformers/models/evolla/modeling_evolla.py +69 -68
  462. transformers/models/evolla/modular_evolla.py +50 -48
  463. transformers/models/evolla/processing_evolla.py +23 -35
  464. transformers/models/exaone4/configuration_exaone4.py +27 -27
  465. transformers/models/exaone4/modeling_exaone4.py +36 -39
  466. transformers/models/exaone4/modular_exaone4.py +51 -50
  467. transformers/models/exaone_moe/__init__.py +27 -0
  468. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  469. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  470. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  471. transformers/models/falcon/configuration_falcon.py +31 -26
  472. transformers/models/falcon/modeling_falcon.py +76 -84
  473. transformers/models/falcon_h1/configuration_falcon_h1.py +57 -51
  474. transformers/models/falcon_h1/modeling_falcon_h1.py +74 -109
  475. transformers/models/falcon_h1/modular_falcon_h1.py +68 -100
  476. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -2
  477. transformers/models/falcon_mamba/modeling_falcon_mamba.py +64 -73
  478. transformers/models/falcon_mamba/modular_falcon_mamba.py +14 -13
  479. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -0
  480. transformers/models/fast_vlm/modeling_fast_vlm.py +70 -97
  481. transformers/models/fast_vlm/modular_fast_vlm.py +148 -38
  482. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +2 -6
  483. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +45 -47
  484. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -3
  485. transformers/models/flaubert/configuration_flaubert.py +10 -5
  486. transformers/models/flaubert/modeling_flaubert.py +125 -129
  487. transformers/models/flaubert/tokenization_flaubert.py +3 -5
  488. transformers/models/flava/configuration_flava.py +9 -9
  489. transformers/models/flava/image_processing_flava.py +66 -67
  490. transformers/models/flava/image_processing_flava_fast.py +46 -47
  491. transformers/models/flava/modeling_flava.py +144 -135
  492. transformers/models/flava/processing_flava.py +2 -12
  493. transformers/models/flex_olmo/__init__.py +0 -1
  494. transformers/models/flex_olmo/configuration_flex_olmo.py +34 -39
  495. transformers/models/flex_olmo/modeling_flex_olmo.py +41 -43
  496. transformers/models/flex_olmo/modular_flex_olmo.py +46 -51
  497. transformers/models/florence2/configuration_florence2.py +4 -1
  498. transformers/models/florence2/modeling_florence2.py +96 -72
  499. transformers/models/florence2/modular_florence2.py +100 -107
  500. transformers/models/florence2/processing_florence2.py +18 -47
  501. transformers/models/fnet/configuration_fnet.py +6 -2
  502. transformers/models/fnet/modeling_fnet.py +69 -80
  503. transformers/models/fnet/tokenization_fnet.py +0 -1
  504. transformers/models/focalnet/configuration_focalnet.py +2 -5
  505. transformers/models/focalnet/modeling_focalnet.py +49 -48
  506. transformers/models/fsmt/configuration_fsmt.py +12 -17
  507. transformers/models/fsmt/modeling_fsmt.py +47 -48
  508. transformers/models/fsmt/tokenization_fsmt.py +3 -5
  509. transformers/models/funnel/configuration_funnel.py +8 -1
  510. transformers/models/funnel/modeling_funnel.py +91 -93
  511. transformers/models/funnel/tokenization_funnel.py +2 -5
  512. transformers/models/fuyu/configuration_fuyu.py +28 -34
  513. transformers/models/fuyu/image_processing_fuyu.py +29 -31
  514. transformers/models/fuyu/image_processing_fuyu_fast.py +17 -17
  515. transformers/models/fuyu/modeling_fuyu.py +50 -52
  516. transformers/models/fuyu/processing_fuyu.py +9 -36
  517. transformers/models/gemma/configuration_gemma.py +25 -30
  518. transformers/models/gemma/modeling_gemma.py +36 -38
  519. transformers/models/gemma/modular_gemma.py +33 -36
  520. transformers/models/gemma/tokenization_gemma.py +3 -6
  521. transformers/models/gemma2/configuration_gemma2.py +30 -35
  522. transformers/models/gemma2/modeling_gemma2.py +38 -41
  523. transformers/models/gemma2/modular_gemma2.py +63 -67
  524. transformers/models/gemma3/configuration_gemma3.py +53 -48
  525. transformers/models/gemma3/image_processing_gemma3.py +29 -31
  526. transformers/models/gemma3/image_processing_gemma3_fast.py +11 -12
  527. transformers/models/gemma3/modeling_gemma3.py +123 -122
  528. transformers/models/gemma3/modular_gemma3.py +128 -125
  529. transformers/models/gemma3/processing_gemma3.py +5 -5
  530. transformers/models/gemma3n/configuration_gemma3n.py +42 -30
  531. transformers/models/gemma3n/feature_extraction_gemma3n.py +9 -11
  532. transformers/models/gemma3n/modeling_gemma3n.py +166 -147
  533. transformers/models/gemma3n/modular_gemma3n.py +176 -148
  534. transformers/models/gemma3n/processing_gemma3n.py +12 -26
  535. transformers/models/git/configuration_git.py +5 -8
  536. transformers/models/git/modeling_git.py +115 -127
  537. transformers/models/git/processing_git.py +2 -14
  538. transformers/models/glm/configuration_glm.py +26 -30
  539. transformers/models/glm/modeling_glm.py +36 -39
  540. transformers/models/glm/modular_glm.py +4 -7
  541. transformers/models/glm4/configuration_glm4.py +26 -30
  542. transformers/models/glm4/modeling_glm4.py +39 -41
  543. transformers/models/glm4/modular_glm4.py +8 -10
  544. transformers/models/glm46v/configuration_glm46v.py +4 -1
  545. transformers/models/glm46v/image_processing_glm46v.py +40 -38
  546. transformers/models/glm46v/image_processing_glm46v_fast.py +9 -9
  547. transformers/models/glm46v/modeling_glm46v.py +138 -93
  548. transformers/models/glm46v/modular_glm46v.py +5 -3
  549. transformers/models/glm46v/processing_glm46v.py +7 -41
  550. transformers/models/glm46v/video_processing_glm46v.py +9 -11
  551. transformers/models/glm4_moe/configuration_glm4_moe.py +42 -35
  552. transformers/models/glm4_moe/modeling_glm4_moe.py +36 -39
  553. transformers/models/glm4_moe/modular_glm4_moe.py +43 -36
  554. transformers/models/glm4_moe_lite/__init__.py +28 -0
  555. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +233 -0
  556. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +740 -0
  557. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +302 -0
  558. transformers/models/glm4v/configuration_glm4v.py +25 -24
  559. transformers/models/glm4v/image_processing_glm4v.py +39 -38
  560. transformers/models/glm4v/image_processing_glm4v_fast.py +8 -9
  561. transformers/models/glm4v/modeling_glm4v.py +249 -210
  562. transformers/models/glm4v/modular_glm4v.py +211 -230
  563. transformers/models/glm4v/processing_glm4v.py +7 -41
  564. transformers/models/glm4v/video_processing_glm4v.py +9 -11
  565. transformers/models/glm4v_moe/configuration_glm4v_moe.py +136 -127
  566. transformers/models/glm4v_moe/modeling_glm4v_moe.py +348 -356
  567. transformers/models/glm4v_moe/modular_glm4v_moe.py +76 -174
  568. transformers/models/glm_image/__init__.py +31 -0
  569. transformers/models/glm_image/configuration_glm_image.py +358 -0
  570. transformers/models/glm_image/image_processing_glm_image.py +503 -0
  571. transformers/models/glm_image/image_processing_glm_image_fast.py +294 -0
  572. transformers/models/glm_image/modeling_glm_image.py +1691 -0
  573. transformers/models/glm_image/modular_glm_image.py +1640 -0
  574. transformers/models/glm_image/processing_glm_image.py +265 -0
  575. transformers/models/glm_ocr/__init__.py +28 -0
  576. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  577. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  578. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  579. transformers/models/glmasr/__init__.py +0 -1
  580. transformers/models/glmasr/configuration_glmasr.py +0 -1
  581. transformers/models/glmasr/modeling_glmasr.py +51 -46
  582. transformers/models/glmasr/modular_glmasr.py +39 -29
  583. transformers/models/glmasr/processing_glmasr.py +7 -8
  584. transformers/models/glpn/configuration_glpn.py +0 -1
  585. transformers/models/glpn/image_processing_glpn.py +11 -12
  586. transformers/models/glpn/image_processing_glpn_fast.py +11 -12
  587. transformers/models/glpn/modeling_glpn.py +14 -14
  588. transformers/models/got_ocr2/configuration_got_ocr2.py +10 -13
  589. transformers/models/got_ocr2/image_processing_got_ocr2.py +22 -24
  590. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +9 -10
  591. transformers/models/got_ocr2/modeling_got_ocr2.py +69 -77
  592. transformers/models/got_ocr2/modular_got_ocr2.py +60 -52
  593. transformers/models/got_ocr2/processing_got_ocr2.py +42 -63
  594. transformers/models/gpt2/configuration_gpt2.py +13 -2
  595. transformers/models/gpt2/modeling_gpt2.py +111 -113
  596. transformers/models/gpt2/tokenization_gpt2.py +6 -9
  597. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -2
  598. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +78 -84
  599. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -2
  600. transformers/models/gpt_neo/modeling_gpt_neo.py +66 -71
  601. transformers/models/gpt_neox/configuration_gpt_neox.py +27 -25
  602. transformers/models/gpt_neox/modeling_gpt_neox.py +74 -76
  603. transformers/models/gpt_neox/modular_gpt_neox.py +68 -70
  604. transformers/models/gpt_neox/tokenization_gpt_neox.py +2 -5
  605. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +24 -19
  606. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +43 -46
  607. transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +1 -3
  608. transformers/models/gpt_oss/configuration_gpt_oss.py +31 -30
  609. transformers/models/gpt_oss/modeling_gpt_oss.py +80 -114
  610. transformers/models/gpt_oss/modular_gpt_oss.py +62 -97
  611. transformers/models/gpt_sw3/tokenization_gpt_sw3.py +4 -4
  612. transformers/models/gptj/configuration_gptj.py +4 -5
  613. transformers/models/gptj/modeling_gptj.py +85 -88
  614. transformers/models/granite/configuration_granite.py +28 -33
  615. transformers/models/granite/modeling_granite.py +43 -45
  616. transformers/models/granite/modular_granite.py +29 -31
  617. transformers/models/granite_speech/configuration_granite_speech.py +0 -1
  618. transformers/models/granite_speech/feature_extraction_granite_speech.py +1 -3
  619. transformers/models/granite_speech/modeling_granite_speech.py +84 -60
  620. transformers/models/granite_speech/processing_granite_speech.py +11 -4
  621. transformers/models/granitemoe/configuration_granitemoe.py +31 -36
  622. transformers/models/granitemoe/modeling_granitemoe.py +39 -41
  623. transformers/models/granitemoe/modular_granitemoe.py +21 -23
  624. transformers/models/granitemoehybrid/__init__.py +0 -1
  625. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +55 -48
  626. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +82 -118
  627. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +57 -65
  628. transformers/models/granitemoeshared/configuration_granitemoeshared.py +33 -37
  629. transformers/models/granitemoeshared/modeling_granitemoeshared.py +52 -56
  630. transformers/models/granitemoeshared/modular_granitemoeshared.py +19 -21
  631. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -46
  632. transformers/models/grounding_dino/image_processing_grounding_dino.py +60 -62
  633. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +28 -29
  634. transformers/models/grounding_dino/modeling_grounding_dino.py +161 -181
  635. transformers/models/grounding_dino/modular_grounding_dino.py +2 -3
  636. transformers/models/grounding_dino/processing_grounding_dino.py +10 -38
  637. transformers/models/groupvit/configuration_groupvit.py +4 -2
  638. transformers/models/groupvit/modeling_groupvit.py +98 -92
  639. transformers/models/helium/configuration_helium.py +25 -29
  640. transformers/models/helium/modeling_helium.py +37 -40
  641. transformers/models/helium/modular_helium.py +3 -7
  642. transformers/models/herbert/tokenization_herbert.py +4 -6
  643. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -5
  644. transformers/models/hgnet_v2/modeling_hgnet_v2.py +12 -14
  645. transformers/models/hgnet_v2/modular_hgnet_v2.py +13 -17
  646. transformers/models/hiera/configuration_hiera.py +2 -5
  647. transformers/models/hiera/modeling_hiera.py +71 -70
  648. transformers/models/hubert/configuration_hubert.py +4 -2
  649. transformers/models/hubert/modeling_hubert.py +42 -41
  650. transformers/models/hubert/modular_hubert.py +8 -11
  651. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +26 -31
  652. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +58 -37
  653. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +31 -11
  654. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +31 -36
  655. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +54 -44
  656. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +27 -15
  657. transformers/models/ibert/configuration_ibert.py +4 -2
  658. transformers/models/ibert/modeling_ibert.py +60 -62
  659. transformers/models/ibert/quant_modules.py +0 -1
  660. transformers/models/idefics/configuration_idefics.py +5 -8
  661. transformers/models/idefics/image_processing_idefics.py +13 -15
  662. transformers/models/idefics/modeling_idefics.py +63 -65
  663. transformers/models/idefics/perceiver.py +1 -3
  664. transformers/models/idefics/processing_idefics.py +32 -48
  665. transformers/models/idefics/vision.py +27 -28
  666. transformers/models/idefics2/configuration_idefics2.py +1 -3
  667. transformers/models/idefics2/image_processing_idefics2.py +31 -32
  668. transformers/models/idefics2/image_processing_idefics2_fast.py +8 -8
  669. transformers/models/idefics2/modeling_idefics2.py +126 -106
  670. transformers/models/idefics2/processing_idefics2.py +10 -68
  671. transformers/models/idefics3/configuration_idefics3.py +1 -4
  672. transformers/models/idefics3/image_processing_idefics3.py +42 -43
  673. transformers/models/idefics3/image_processing_idefics3_fast.py +40 -15
  674. transformers/models/idefics3/modeling_idefics3.py +113 -92
  675. transformers/models/idefics3/processing_idefics3.py +15 -69
  676. transformers/models/ijepa/configuration_ijepa.py +0 -1
  677. transformers/models/ijepa/modeling_ijepa.py +13 -14
  678. transformers/models/ijepa/modular_ijepa.py +5 -7
  679. transformers/models/imagegpt/configuration_imagegpt.py +9 -2
  680. transformers/models/imagegpt/image_processing_imagegpt.py +17 -18
  681. transformers/models/imagegpt/image_processing_imagegpt_fast.py +10 -11
  682. transformers/models/imagegpt/modeling_imagegpt.py +65 -62
  683. transformers/models/informer/configuration_informer.py +6 -9
  684. transformers/models/informer/modeling_informer.py +87 -89
  685. transformers/models/informer/modular_informer.py +13 -16
  686. transformers/models/instructblip/configuration_instructblip.py +2 -2
  687. transformers/models/instructblip/modeling_instructblip.py +104 -79
  688. transformers/models/instructblip/processing_instructblip.py +10 -36
  689. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -2
  690. transformers/models/instructblipvideo/modeling_instructblipvideo.py +108 -105
  691. transformers/models/instructblipvideo/modular_instructblipvideo.py +73 -64
  692. transformers/models/instructblipvideo/processing_instructblipvideo.py +14 -33
  693. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +6 -7
  694. transformers/models/internvl/configuration_internvl.py +5 -1
  695. transformers/models/internvl/modeling_internvl.py +76 -98
  696. transformers/models/internvl/modular_internvl.py +45 -59
  697. transformers/models/internvl/processing_internvl.py +12 -45
  698. transformers/models/internvl/video_processing_internvl.py +10 -11
  699. transformers/models/jais2/configuration_jais2.py +25 -29
  700. transformers/models/jais2/modeling_jais2.py +36 -38
  701. transformers/models/jais2/modular_jais2.py +20 -22
  702. transformers/models/jamba/configuration_jamba.py +5 -8
  703. transformers/models/jamba/modeling_jamba.py +47 -50
  704. transformers/models/jamba/modular_jamba.py +40 -41
  705. transformers/models/janus/configuration_janus.py +0 -1
  706. transformers/models/janus/image_processing_janus.py +37 -39
  707. transformers/models/janus/image_processing_janus_fast.py +20 -21
  708. transformers/models/janus/modeling_janus.py +103 -188
  709. transformers/models/janus/modular_janus.py +122 -83
  710. transformers/models/janus/processing_janus.py +17 -43
  711. transformers/models/jetmoe/configuration_jetmoe.py +26 -27
  712. transformers/models/jetmoe/modeling_jetmoe.py +42 -45
  713. transformers/models/jetmoe/modular_jetmoe.py +33 -36
  714. transformers/models/kosmos2/configuration_kosmos2.py +10 -9
  715. transformers/models/kosmos2/modeling_kosmos2.py +199 -178
  716. transformers/models/kosmos2/processing_kosmos2.py +40 -55
  717. transformers/models/kosmos2_5/__init__.py +0 -1
  718. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -9
  719. transformers/models/kosmos2_5/image_processing_kosmos2_5.py +10 -12
  720. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -11
  721. transformers/models/kosmos2_5/modeling_kosmos2_5.py +162 -172
  722. transformers/models/kosmos2_5/processing_kosmos2_5.py +8 -29
  723. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +31 -28
  724. transformers/models/kyutai_speech_to_text/feature_extraction_kyutai_speech_to_text.py +12 -14
  725. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +103 -106
  726. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +20 -22
  727. transformers/models/kyutai_speech_to_text/processing_kyutai_speech_to_text.py +2 -8
  728. transformers/models/lasr/configuration_lasr.py +3 -7
  729. transformers/models/lasr/feature_extraction_lasr.py +10 -12
  730. transformers/models/lasr/modeling_lasr.py +21 -24
  731. transformers/models/lasr/modular_lasr.py +11 -13
  732. transformers/models/lasr/processing_lasr.py +12 -6
  733. transformers/models/lasr/tokenization_lasr.py +2 -4
  734. transformers/models/layoutlm/configuration_layoutlm.py +14 -2
  735. transformers/models/layoutlm/modeling_layoutlm.py +70 -72
  736. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -17
  737. transformers/models/layoutlmv2/image_processing_layoutlmv2.py +18 -21
  738. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +7 -8
  739. transformers/models/layoutlmv2/modeling_layoutlmv2.py +48 -50
  740. transformers/models/layoutlmv2/processing_layoutlmv2.py +14 -44
  741. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +63 -74
  742. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -19
  743. transformers/models/layoutlmv3/image_processing_layoutlmv3.py +24 -26
  744. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +9 -10
  745. transformers/models/layoutlmv3/modeling_layoutlmv3.py +49 -51
  746. transformers/models/layoutlmv3/processing_layoutlmv3.py +14 -46
  747. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +64 -75
  748. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -17
  749. transformers/models/layoutxlm/modular_layoutxlm.py +0 -1
  750. transformers/models/layoutxlm/processing_layoutxlm.py +14 -44
  751. transformers/models/layoutxlm/tokenization_layoutxlm.py +65 -76
  752. transformers/models/led/configuration_led.py +8 -12
  753. transformers/models/led/modeling_led.py +113 -267
  754. transformers/models/levit/configuration_levit.py +0 -1
  755. transformers/models/levit/image_processing_levit.py +19 -21
  756. transformers/models/levit/image_processing_levit_fast.py +4 -5
  757. transformers/models/levit/modeling_levit.py +17 -19
  758. transformers/models/lfm2/configuration_lfm2.py +27 -30
  759. transformers/models/lfm2/modeling_lfm2.py +46 -48
  760. transformers/models/lfm2/modular_lfm2.py +32 -32
  761. transformers/models/lfm2_moe/__init__.py +0 -1
  762. transformers/models/lfm2_moe/configuration_lfm2_moe.py +6 -9
  763. transformers/models/lfm2_moe/modeling_lfm2_moe.py +48 -49
  764. transformers/models/lfm2_moe/modular_lfm2_moe.py +8 -9
  765. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -1
  766. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +43 -20
  767. transformers/models/lfm2_vl/modeling_lfm2_vl.py +73 -61
  768. transformers/models/lfm2_vl/modular_lfm2_vl.py +66 -54
  769. transformers/models/lfm2_vl/processing_lfm2_vl.py +14 -34
  770. transformers/models/lightglue/image_processing_lightglue.py +16 -15
  771. transformers/models/lightglue/image_processing_lightglue_fast.py +8 -7
  772. transformers/models/lightglue/modeling_lightglue.py +31 -33
  773. transformers/models/lightglue/modular_lightglue.py +31 -31
  774. transformers/models/lighton_ocr/__init__.py +28 -0
  775. transformers/models/lighton_ocr/configuration_lighton_ocr.py +128 -0
  776. transformers/models/lighton_ocr/modeling_lighton_ocr.py +463 -0
  777. transformers/models/lighton_ocr/modular_lighton_ocr.py +404 -0
  778. transformers/models/lighton_ocr/processing_lighton_ocr.py +229 -0
  779. transformers/models/lilt/configuration_lilt.py +6 -2
  780. transformers/models/lilt/modeling_lilt.py +53 -55
  781. transformers/models/llama/configuration_llama.py +26 -31
  782. transformers/models/llama/modeling_llama.py +35 -38
  783. transformers/models/llama/tokenization_llama.py +2 -4
  784. transformers/models/llama4/configuration_llama4.py +87 -69
  785. transformers/models/llama4/image_processing_llama4_fast.py +11 -12
  786. transformers/models/llama4/modeling_llama4.py +116 -115
  787. transformers/models/llama4/processing_llama4.py +33 -57
  788. transformers/models/llava/configuration_llava.py +10 -1
  789. transformers/models/llava/image_processing_llava.py +25 -28
  790. transformers/models/llava/image_processing_llava_fast.py +9 -10
  791. transformers/models/llava/modeling_llava.py +73 -102
  792. transformers/models/llava/processing_llava.py +18 -51
  793. transformers/models/llava_next/configuration_llava_next.py +2 -2
  794. transformers/models/llava_next/image_processing_llava_next.py +43 -45
  795. transformers/models/llava_next/image_processing_llava_next_fast.py +11 -12
  796. transformers/models/llava_next/modeling_llava_next.py +103 -104
  797. transformers/models/llava_next/processing_llava_next.py +18 -47
  798. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -7
  799. transformers/models/llava_next_video/modeling_llava_next_video.py +168 -155
  800. transformers/models/llava_next_video/modular_llava_next_video.py +154 -147
  801. transformers/models/llava_next_video/processing_llava_next_video.py +21 -63
  802. transformers/models/llava_next_video/video_processing_llava_next_video.py +0 -1
  803. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -7
  804. transformers/models/llava_onevision/image_processing_llava_onevision.py +40 -42
  805. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +14 -14
  806. transformers/models/llava_onevision/modeling_llava_onevision.py +170 -166
  807. transformers/models/llava_onevision/modular_llava_onevision.py +156 -152
  808. transformers/models/llava_onevision/processing_llava_onevision.py +21 -53
  809. transformers/models/llava_onevision/video_processing_llava_onevision.py +0 -1
  810. transformers/models/longcat_flash/__init__.py +0 -1
  811. transformers/models/longcat_flash/configuration_longcat_flash.py +39 -45
  812. transformers/models/longcat_flash/modeling_longcat_flash.py +37 -38
  813. transformers/models/longcat_flash/modular_longcat_flash.py +23 -24
  814. transformers/models/longformer/configuration_longformer.py +5 -5
  815. transformers/models/longformer/modeling_longformer.py +99 -101
  816. transformers/models/longt5/configuration_longt5.py +9 -7
  817. transformers/models/longt5/modeling_longt5.py +45 -45
  818. transformers/models/luke/configuration_luke.py +8 -2
  819. transformers/models/luke/modeling_luke.py +179 -181
  820. transformers/models/luke/tokenization_luke.py +99 -105
  821. transformers/{pipelines/deprecated → models/lw_detr}/__init__.py +14 -3
  822. transformers/models/lw_detr/configuration_lw_detr.py +362 -0
  823. transformers/models/lw_detr/modeling_lw_detr.py +1697 -0
  824. transformers/models/lw_detr/modular_lw_detr.py +1609 -0
  825. transformers/models/lxmert/configuration_lxmert.py +16 -1
  826. transformers/models/lxmert/modeling_lxmert.py +63 -74
  827. transformers/models/m2m_100/configuration_m2m_100.py +7 -9
  828. transformers/models/m2m_100/modeling_m2m_100.py +72 -74
  829. transformers/models/m2m_100/tokenization_m2m_100.py +8 -8
  830. transformers/models/mamba/configuration_mamba.py +5 -3
  831. transformers/models/mamba/modeling_mamba.py +61 -70
  832. transformers/models/mamba2/configuration_mamba2.py +5 -8
  833. transformers/models/mamba2/modeling_mamba2.py +66 -79
  834. transformers/models/marian/configuration_marian.py +10 -5
  835. transformers/models/marian/modeling_marian.py +88 -90
  836. transformers/models/marian/tokenization_marian.py +6 -6
  837. transformers/models/markuplm/configuration_markuplm.py +4 -7
  838. transformers/models/markuplm/feature_extraction_markuplm.py +1 -2
  839. transformers/models/markuplm/modeling_markuplm.py +63 -65
  840. transformers/models/markuplm/processing_markuplm.py +31 -38
  841. transformers/models/markuplm/tokenization_markuplm.py +67 -77
  842. transformers/models/mask2former/configuration_mask2former.py +14 -52
  843. transformers/models/mask2former/image_processing_mask2former.py +84 -85
  844. transformers/models/mask2former/image_processing_mask2former_fast.py +36 -36
  845. transformers/models/mask2former/modeling_mask2former.py +108 -104
  846. transformers/models/mask2former/modular_mask2former.py +6 -8
  847. transformers/models/maskformer/configuration_maskformer.py +17 -51
  848. transformers/models/maskformer/configuration_maskformer_swin.py +2 -5
  849. transformers/models/maskformer/image_processing_maskformer.py +84 -85
  850. transformers/models/maskformer/image_processing_maskformer_fast.py +35 -36
  851. transformers/models/maskformer/modeling_maskformer.py +71 -67
  852. transformers/models/maskformer/modeling_maskformer_swin.py +20 -23
  853. transformers/models/mbart/configuration_mbart.py +9 -5
  854. transformers/models/mbart/modeling_mbart.py +120 -119
  855. transformers/models/mbart/tokenization_mbart.py +2 -4
  856. transformers/models/mbart50/tokenization_mbart50.py +3 -5
  857. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -3
  858. transformers/models/megatron_bert/modeling_megatron_bert.py +139 -165
  859. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  860. transformers/models/metaclip_2/modeling_metaclip_2.py +94 -87
  861. transformers/models/metaclip_2/modular_metaclip_2.py +59 -45
  862. transformers/models/mgp_str/configuration_mgp_str.py +0 -1
  863. transformers/models/mgp_str/modeling_mgp_str.py +18 -18
  864. transformers/models/mgp_str/processing_mgp_str.py +3 -20
  865. transformers/models/mgp_str/tokenization_mgp_str.py +1 -3
  866. transformers/models/mimi/configuration_mimi.py +42 -40
  867. transformers/models/mimi/modeling_mimi.py +116 -115
  868. transformers/models/minimax/__init__.py +0 -1
  869. transformers/models/minimax/configuration_minimax.py +40 -47
  870. transformers/models/minimax/modeling_minimax.py +46 -49
  871. transformers/models/minimax/modular_minimax.py +59 -65
  872. transformers/models/minimax_m2/__init__.py +28 -0
  873. transformers/models/minimax_m2/configuration_minimax_m2.py +188 -0
  874. transformers/models/minimax_m2/modeling_minimax_m2.py +704 -0
  875. transformers/models/minimax_m2/modular_minimax_m2.py +346 -0
  876. transformers/models/ministral/configuration_ministral.py +25 -29
  877. transformers/models/ministral/modeling_ministral.py +35 -37
  878. transformers/models/ministral/modular_ministral.py +32 -37
  879. transformers/models/ministral3/configuration_ministral3.py +23 -26
  880. transformers/models/ministral3/modeling_ministral3.py +35 -37
  881. transformers/models/ministral3/modular_ministral3.py +7 -8
  882. transformers/models/mistral/configuration_mistral.py +24 -29
  883. transformers/models/mistral/modeling_mistral.py +35 -37
  884. transformers/models/mistral/modular_mistral.py +14 -15
  885. transformers/models/mistral3/configuration_mistral3.py +4 -1
  886. transformers/models/mistral3/modeling_mistral3.py +79 -82
  887. transformers/models/mistral3/modular_mistral3.py +66 -67
  888. transformers/models/mixtral/configuration_mixtral.py +32 -38
  889. transformers/models/mixtral/modeling_mixtral.py +39 -42
  890. transformers/models/mixtral/modular_mixtral.py +26 -29
  891. transformers/models/mlcd/configuration_mlcd.py +0 -1
  892. transformers/models/mlcd/modeling_mlcd.py +17 -17
  893. transformers/models/mlcd/modular_mlcd.py +16 -16
  894. transformers/models/mllama/configuration_mllama.py +10 -15
  895. transformers/models/mllama/image_processing_mllama.py +23 -25
  896. transformers/models/mllama/image_processing_mllama_fast.py +11 -11
  897. transformers/models/mllama/modeling_mllama.py +100 -103
  898. transformers/models/mllama/processing_mllama.py +6 -55
  899. transformers/models/mluke/tokenization_mluke.py +97 -103
  900. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -46
  901. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +159 -179
  902. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -46
  903. transformers/models/mobilebert/configuration_mobilebert.py +4 -2
  904. transformers/models/mobilebert/modeling_mobilebert.py +78 -88
  905. transformers/models/mobilebert/tokenization_mobilebert.py +0 -1
  906. transformers/models/mobilenet_v1/configuration_mobilenet_v1.py +0 -1
  907. transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py +20 -23
  908. transformers/models/mobilenet_v1/image_processing_mobilenet_v1_fast.py +0 -1
  909. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +13 -16
  910. transformers/models/mobilenet_v2/configuration_mobilenet_v2.py +0 -1
  911. transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py +48 -51
  912. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +14 -15
  913. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +21 -22
  914. transformers/models/mobilevit/configuration_mobilevit.py +0 -1
  915. transformers/models/mobilevit/image_processing_mobilevit.py +41 -44
  916. transformers/models/mobilevit/image_processing_mobilevit_fast.py +12 -13
  917. transformers/models/mobilevit/modeling_mobilevit.py +21 -21
  918. transformers/models/mobilevitv2/configuration_mobilevitv2.py +0 -1
  919. transformers/models/mobilevitv2/modeling_mobilevitv2.py +21 -22
  920. transformers/models/modernbert/configuration_modernbert.py +76 -51
  921. transformers/models/modernbert/modeling_modernbert.py +188 -943
  922. transformers/models/modernbert/modular_modernbert.py +255 -978
  923. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +50 -44
  924. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +54 -64
  925. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +92 -92
  926. transformers/models/moonshine/configuration_moonshine.py +34 -31
  927. transformers/models/moonshine/modeling_moonshine.py +70 -72
  928. transformers/models/moonshine/modular_moonshine.py +91 -86
  929. transformers/models/moshi/configuration_moshi.py +46 -23
  930. transformers/models/moshi/modeling_moshi.py +134 -142
  931. transformers/models/mpnet/configuration_mpnet.py +6 -2
  932. transformers/models/mpnet/modeling_mpnet.py +55 -57
  933. transformers/models/mpnet/tokenization_mpnet.py +1 -4
  934. transformers/models/mpt/configuration_mpt.py +17 -9
  935. transformers/models/mpt/modeling_mpt.py +58 -60
  936. transformers/models/mra/configuration_mra.py +8 -2
  937. transformers/models/mra/modeling_mra.py +54 -56
  938. transformers/models/mt5/configuration_mt5.py +9 -6
  939. transformers/models/mt5/modeling_mt5.py +80 -85
  940. transformers/models/musicgen/configuration_musicgen.py +12 -8
  941. transformers/models/musicgen/modeling_musicgen.py +114 -116
  942. transformers/models/musicgen/processing_musicgen.py +3 -21
  943. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -8
  944. transformers/models/musicgen_melody/feature_extraction_musicgen_melody.py +8 -9
  945. transformers/models/musicgen_melody/modeling_musicgen_melody.py +113 -126
  946. transformers/models/musicgen_melody/processing_musicgen_melody.py +3 -22
  947. transformers/models/mvp/configuration_mvp.py +8 -5
  948. transformers/models/mvp/modeling_mvp.py +121 -123
  949. transformers/models/myt5/tokenization_myt5.py +8 -10
  950. transformers/models/nanochat/configuration_nanochat.py +5 -8
  951. transformers/models/nanochat/modeling_nanochat.py +36 -39
  952. transformers/models/nanochat/modular_nanochat.py +16 -18
  953. transformers/models/nemotron/configuration_nemotron.py +25 -30
  954. transformers/models/nemotron/modeling_nemotron.py +53 -66
  955. transformers/models/nllb/tokenization_nllb.py +14 -14
  956. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -10
  957. transformers/models/nllb_moe/modeling_nllb_moe.py +70 -72
  958. transformers/models/nougat/image_processing_nougat.py +29 -32
  959. transformers/models/nougat/image_processing_nougat_fast.py +12 -13
  960. transformers/models/nougat/processing_nougat.py +37 -39
  961. transformers/models/nougat/tokenization_nougat.py +5 -7
  962. transformers/models/nystromformer/configuration_nystromformer.py +8 -2
  963. transformers/models/nystromformer/modeling_nystromformer.py +61 -63
  964. transformers/models/olmo/configuration_olmo.py +23 -28
  965. transformers/models/olmo/modeling_olmo.py +35 -38
  966. transformers/models/olmo/modular_olmo.py +8 -12
  967. transformers/models/olmo2/configuration_olmo2.py +27 -32
  968. transformers/models/olmo2/modeling_olmo2.py +36 -39
  969. transformers/models/olmo2/modular_olmo2.py +36 -38
  970. transformers/models/olmo3/__init__.py +0 -1
  971. transformers/models/olmo3/configuration_olmo3.py +30 -34
  972. transformers/models/olmo3/modeling_olmo3.py +35 -38
  973. transformers/models/olmo3/modular_olmo3.py +44 -47
  974. transformers/models/olmoe/configuration_olmoe.py +29 -33
  975. transformers/models/olmoe/modeling_olmoe.py +41 -43
  976. transformers/models/olmoe/modular_olmoe.py +15 -16
  977. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -50
  978. transformers/models/omdet_turbo/modeling_omdet_turbo.py +59 -57
  979. transformers/models/omdet_turbo/processing_omdet_turbo.py +19 -67
  980. transformers/models/oneformer/configuration_oneformer.py +11 -51
  981. transformers/models/oneformer/image_processing_oneformer.py +83 -84
  982. transformers/models/oneformer/image_processing_oneformer_fast.py +41 -42
  983. transformers/models/oneformer/modeling_oneformer.py +137 -133
  984. transformers/models/oneformer/processing_oneformer.py +28 -43
  985. transformers/models/openai/configuration_openai.py +16 -1
  986. transformers/models/openai/modeling_openai.py +50 -51
  987. transformers/models/openai/tokenization_openai.py +2 -5
  988. transformers/models/opt/configuration_opt.py +6 -7
  989. transformers/models/opt/modeling_opt.py +79 -80
  990. transformers/models/ovis2/__init__.py +0 -1
  991. transformers/models/ovis2/configuration_ovis2.py +4 -1
  992. transformers/models/ovis2/image_processing_ovis2.py +22 -24
  993. transformers/models/ovis2/image_processing_ovis2_fast.py +9 -10
  994. transformers/models/ovis2/modeling_ovis2.py +99 -142
  995. transformers/models/ovis2/modular_ovis2.py +82 -45
  996. transformers/models/ovis2/processing_ovis2.py +12 -40
  997. transformers/models/owlv2/configuration_owlv2.py +4 -2
  998. transformers/models/owlv2/image_processing_owlv2.py +20 -21
  999. transformers/models/owlv2/image_processing_owlv2_fast.py +12 -13
  1000. transformers/models/owlv2/modeling_owlv2.py +122 -114
  1001. transformers/models/owlv2/modular_owlv2.py +11 -12
  1002. transformers/models/owlv2/processing_owlv2.py +20 -49
  1003. transformers/models/owlvit/configuration_owlvit.py +4 -2
  1004. transformers/models/owlvit/image_processing_owlvit.py +21 -22
  1005. transformers/models/owlvit/image_processing_owlvit_fast.py +2 -3
  1006. transformers/models/owlvit/modeling_owlvit.py +121 -113
  1007. transformers/models/owlvit/processing_owlvit.py +20 -48
  1008. transformers/models/paddleocr_vl/__init__.py +0 -1
  1009. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +28 -29
  1010. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +34 -35
  1011. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +12 -12
  1012. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +159 -158
  1013. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +148 -119
  1014. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +1 -3
  1015. transformers/models/paligemma/configuration_paligemma.py +4 -1
  1016. transformers/models/paligemma/modeling_paligemma.py +81 -79
  1017. transformers/models/paligemma/processing_paligemma.py +13 -66
  1018. transformers/models/parakeet/configuration_parakeet.py +3 -8
  1019. transformers/models/parakeet/feature_extraction_parakeet.py +10 -12
  1020. transformers/models/parakeet/modeling_parakeet.py +21 -25
  1021. transformers/models/parakeet/modular_parakeet.py +19 -21
  1022. transformers/models/parakeet/processing_parakeet.py +12 -5
  1023. transformers/models/parakeet/tokenization_parakeet.py +2 -4
  1024. transformers/models/patchtsmixer/configuration_patchtsmixer.py +5 -8
  1025. transformers/models/patchtsmixer/modeling_patchtsmixer.py +63 -65
  1026. transformers/models/patchtst/configuration_patchtst.py +6 -9
  1027. transformers/models/patchtst/modeling_patchtst.py +75 -77
  1028. transformers/models/pe_audio/__init__.py +0 -1
  1029. transformers/models/pe_audio/configuration_pe_audio.py +14 -16
  1030. transformers/models/pe_audio/feature_extraction_pe_audio.py +6 -8
  1031. transformers/models/pe_audio/modeling_pe_audio.py +30 -31
  1032. transformers/models/pe_audio/modular_pe_audio.py +17 -18
  1033. transformers/models/pe_audio/processing_pe_audio.py +0 -1
  1034. transformers/models/pe_audio_video/__init__.py +0 -1
  1035. transformers/models/pe_audio_video/configuration_pe_audio_video.py +15 -17
  1036. transformers/models/pe_audio_video/modeling_pe_audio_video.py +64 -65
  1037. transformers/models/pe_audio_video/modular_pe_audio_video.py +56 -57
  1038. transformers/models/pe_audio_video/processing_pe_audio_video.py +0 -1
  1039. transformers/models/pe_video/__init__.py +0 -1
  1040. transformers/models/pe_video/configuration_pe_video.py +14 -16
  1041. transformers/models/pe_video/modeling_pe_video.py +57 -46
  1042. transformers/models/pe_video/modular_pe_video.py +47 -35
  1043. transformers/models/pe_video/video_processing_pe_video.py +2 -4
  1044. transformers/models/pegasus/configuration_pegasus.py +8 -6
  1045. transformers/models/pegasus/modeling_pegasus.py +67 -69
  1046. transformers/models/pegasus/tokenization_pegasus.py +1 -4
  1047. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -4
  1048. transformers/models/pegasus_x/modeling_pegasus_x.py +53 -55
  1049. transformers/models/perceiver/configuration_perceiver.py +0 -1
  1050. transformers/models/perceiver/image_processing_perceiver.py +22 -25
  1051. transformers/models/perceiver/image_processing_perceiver_fast.py +7 -8
  1052. transformers/models/perceiver/modeling_perceiver.py +152 -145
  1053. transformers/models/perceiver/tokenization_perceiver.py +3 -6
  1054. transformers/models/perception_lm/configuration_perception_lm.py +0 -1
  1055. transformers/models/perception_lm/image_processing_perception_lm_fast.py +8 -9
  1056. transformers/models/perception_lm/modeling_perception_lm.py +64 -67
  1057. transformers/models/perception_lm/modular_perception_lm.py +58 -58
  1058. transformers/models/perception_lm/processing_perception_lm.py +13 -47
  1059. transformers/models/perception_lm/video_processing_perception_lm.py +0 -1
  1060. transformers/models/persimmon/configuration_persimmon.py +23 -28
  1061. transformers/models/persimmon/modeling_persimmon.py +44 -47
  1062. transformers/models/phi/configuration_phi.py +27 -28
  1063. transformers/models/phi/modeling_phi.py +39 -41
  1064. transformers/models/phi/modular_phi.py +26 -26
  1065. transformers/models/phi3/configuration_phi3.py +32 -37
  1066. transformers/models/phi3/modeling_phi3.py +37 -40
  1067. transformers/models/phi3/modular_phi3.py +16 -20
  1068. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +36 -39
  1069. transformers/models/phi4_multimodal/feature_extraction_phi4_multimodal.py +7 -9
  1070. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +11 -11
  1071. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +100 -117
  1072. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +103 -90
  1073. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +7 -42
  1074. transformers/models/phimoe/configuration_phimoe.py +31 -36
  1075. transformers/models/phimoe/modeling_phimoe.py +50 -77
  1076. transformers/models/phimoe/modular_phimoe.py +12 -8
  1077. transformers/models/phobert/tokenization_phobert.py +4 -6
  1078. transformers/models/pix2struct/configuration_pix2struct.py +12 -10
  1079. transformers/models/pix2struct/image_processing_pix2struct.py +15 -19
  1080. transformers/models/pix2struct/image_processing_pix2struct_fast.py +12 -15
  1081. transformers/models/pix2struct/modeling_pix2struct.py +56 -52
  1082. transformers/models/pix2struct/processing_pix2struct.py +5 -26
  1083. transformers/models/pixio/__init__.py +0 -1
  1084. transformers/models/pixio/configuration_pixio.py +2 -5
  1085. transformers/models/pixio/modeling_pixio.py +16 -17
  1086. transformers/models/pixio/modular_pixio.py +7 -8
  1087. transformers/models/pixtral/configuration_pixtral.py +11 -14
  1088. transformers/models/pixtral/image_processing_pixtral.py +26 -28
  1089. transformers/models/pixtral/image_processing_pixtral_fast.py +10 -11
  1090. transformers/models/pixtral/modeling_pixtral.py +31 -37
  1091. transformers/models/pixtral/processing_pixtral.py +18 -52
  1092. transformers/models/plbart/configuration_plbart.py +8 -6
  1093. transformers/models/plbart/modeling_plbart.py +109 -109
  1094. transformers/models/plbart/modular_plbart.py +31 -33
  1095. transformers/models/plbart/tokenization_plbart.py +4 -5
  1096. transformers/models/poolformer/configuration_poolformer.py +0 -1
  1097. transformers/models/poolformer/image_processing_poolformer.py +21 -24
  1098. transformers/models/poolformer/image_processing_poolformer_fast.py +13 -14
  1099. transformers/models/poolformer/modeling_poolformer.py +10 -12
  1100. transformers/models/pop2piano/configuration_pop2piano.py +7 -7
  1101. transformers/models/pop2piano/feature_extraction_pop2piano.py +6 -9
  1102. transformers/models/pop2piano/modeling_pop2piano.py +24 -24
  1103. transformers/models/pop2piano/processing_pop2piano.py +25 -33
  1104. transformers/models/pop2piano/tokenization_pop2piano.py +15 -23
  1105. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  1106. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  1107. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  1108. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  1109. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  1110. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +13 -46
  1111. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything.py +28 -28
  1112. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +20 -21
  1113. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +17 -16
  1114. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +21 -20
  1115. transformers/models/prophetnet/configuration_prophetnet.py +37 -38
  1116. transformers/models/prophetnet/modeling_prophetnet.py +121 -153
  1117. transformers/models/prophetnet/tokenization_prophetnet.py +14 -16
  1118. transformers/models/pvt/configuration_pvt.py +0 -1
  1119. transformers/models/pvt/image_processing_pvt.py +24 -27
  1120. transformers/models/pvt/image_processing_pvt_fast.py +1 -2
  1121. transformers/models/pvt/modeling_pvt.py +19 -21
  1122. transformers/models/pvt_v2/configuration_pvt_v2.py +4 -8
  1123. transformers/models/pvt_v2/modeling_pvt_v2.py +27 -28
  1124. transformers/models/qwen2/configuration_qwen2.py +32 -25
  1125. transformers/models/qwen2/modeling_qwen2.py +35 -37
  1126. transformers/models/qwen2/modular_qwen2.py +14 -15
  1127. transformers/models/qwen2/tokenization_qwen2.py +2 -9
  1128. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +36 -27
  1129. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +241 -214
  1130. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +228 -193
  1131. transformers/models/qwen2_5_omni/processing_qwen2_5_omni.py +41 -49
  1132. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +28 -34
  1133. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +188 -145
  1134. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +64 -91
  1135. transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +7 -43
  1136. transformers/models/qwen2_audio/configuration_qwen2_audio.py +0 -1
  1137. transformers/models/qwen2_audio/modeling_qwen2_audio.py +39 -41
  1138. transformers/models/qwen2_audio/processing_qwen2_audio.py +13 -42
  1139. transformers/models/qwen2_moe/configuration_qwen2_moe.py +42 -35
  1140. transformers/models/qwen2_moe/modeling_qwen2_moe.py +40 -43
  1141. transformers/models/qwen2_moe/modular_qwen2_moe.py +10 -13
  1142. transformers/models/qwen2_vl/configuration_qwen2_vl.py +28 -33
  1143. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +38 -40
  1144. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +12 -15
  1145. transformers/models/qwen2_vl/modeling_qwen2_vl.py +184 -141
  1146. transformers/models/qwen2_vl/processing_qwen2_vl.py +7 -44
  1147. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +38 -18
  1148. transformers/models/qwen3/configuration_qwen3.py +34 -27
  1149. transformers/models/qwen3/modeling_qwen3.py +35 -38
  1150. transformers/models/qwen3/modular_qwen3.py +7 -9
  1151. transformers/models/qwen3_moe/configuration_qwen3_moe.py +45 -35
  1152. transformers/models/qwen3_moe/modeling_qwen3_moe.py +40 -43
  1153. transformers/models/qwen3_moe/modular_qwen3_moe.py +10 -13
  1154. transformers/models/qwen3_next/configuration_qwen3_next.py +47 -38
  1155. transformers/models/qwen3_next/modeling_qwen3_next.py +44 -47
  1156. transformers/models/qwen3_next/modular_qwen3_next.py +37 -38
  1157. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +139 -106
  1158. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +266 -206
  1159. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +228 -181
  1160. transformers/models/qwen3_omni_moe/processing_qwen3_omni_moe.py +40 -48
  1161. transformers/models/qwen3_vl/configuration_qwen3_vl.py +22 -24
  1162. transformers/models/qwen3_vl/modeling_qwen3_vl.py +185 -122
  1163. transformers/models/qwen3_vl/modular_qwen3_vl.py +153 -139
  1164. transformers/models/qwen3_vl/processing_qwen3_vl.py +6 -42
  1165. transformers/models/qwen3_vl/video_processing_qwen3_vl.py +10 -12
  1166. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +27 -30
  1167. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +249 -178
  1168. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +55 -42
  1169. transformers/models/rag/configuration_rag.py +6 -7
  1170. transformers/models/rag/modeling_rag.py +119 -121
  1171. transformers/models/rag/retrieval_rag.py +3 -5
  1172. transformers/models/rag/tokenization_rag.py +0 -50
  1173. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +29 -30
  1174. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +35 -39
  1175. transformers/models/reformer/configuration_reformer.py +7 -8
  1176. transformers/models/reformer/modeling_reformer.py +67 -68
  1177. transformers/models/reformer/tokenization_reformer.py +3 -6
  1178. transformers/models/regnet/configuration_regnet.py +0 -1
  1179. transformers/models/regnet/modeling_regnet.py +7 -9
  1180. transformers/models/rembert/configuration_rembert.py +8 -2
  1181. transformers/models/rembert/modeling_rembert.py +108 -132
  1182. transformers/models/rembert/tokenization_rembert.py +1 -4
  1183. transformers/models/resnet/configuration_resnet.py +2 -5
  1184. transformers/models/resnet/modeling_resnet.py +14 -15
  1185. transformers/models/roberta/configuration_roberta.py +11 -3
  1186. transformers/models/roberta/modeling_roberta.py +97 -99
  1187. transformers/models/roberta/modular_roberta.py +55 -58
  1188. transformers/models/roberta/tokenization_roberta.py +2 -5
  1189. transformers/models/roberta/tokenization_roberta_old.py +2 -4
  1190. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -3
  1191. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +97 -99
  1192. transformers/models/roc_bert/configuration_roc_bert.py +8 -2
  1193. transformers/models/roc_bert/modeling_roc_bert.py +125 -162
  1194. transformers/models/roc_bert/tokenization_roc_bert.py +88 -94
  1195. transformers/models/roformer/configuration_roformer.py +13 -3
  1196. transformers/models/roformer/modeling_roformer.py +79 -95
  1197. transformers/models/roformer/tokenization_roformer.py +3 -6
  1198. transformers/models/roformer/tokenization_utils.py +0 -1
  1199. transformers/models/rt_detr/configuration_rt_detr.py +8 -50
  1200. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -5
  1201. transformers/models/rt_detr/image_processing_rt_detr.py +54 -55
  1202. transformers/models/rt_detr/image_processing_rt_detr_fast.py +39 -26
  1203. transformers/models/rt_detr/modeling_rt_detr.py +643 -804
  1204. transformers/models/rt_detr/modeling_rt_detr_resnet.py +4 -7
  1205. transformers/models/rt_detr/modular_rt_detr.py +1522 -20
  1206. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -58
  1207. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +384 -521
  1208. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +27 -70
  1209. transformers/models/rwkv/configuration_rwkv.py +2 -4
  1210. transformers/models/rwkv/modeling_rwkv.py +29 -54
  1211. transformers/models/sam/configuration_sam.py +2 -1
  1212. transformers/models/sam/image_processing_sam.py +59 -60
  1213. transformers/models/sam/image_processing_sam_fast.py +25 -26
  1214. transformers/models/sam/modeling_sam.py +46 -43
  1215. transformers/models/sam/processing_sam.py +39 -27
  1216. transformers/models/sam2/configuration_sam2.py +1 -2
  1217. transformers/models/sam2/image_processing_sam2_fast.py +14 -15
  1218. transformers/models/sam2/modeling_sam2.py +96 -94
  1219. transformers/models/sam2/modular_sam2.py +85 -94
  1220. transformers/models/sam2/processing_sam2.py +31 -47
  1221. transformers/models/sam2_video/configuration_sam2_video.py +0 -1
  1222. transformers/models/sam2_video/modeling_sam2_video.py +114 -116
  1223. transformers/models/sam2_video/modular_sam2_video.py +72 -89
  1224. transformers/models/sam2_video/processing_sam2_video.py +49 -66
  1225. transformers/models/sam2_video/video_processing_sam2_video.py +1 -4
  1226. transformers/models/sam3/configuration_sam3.py +0 -1
  1227. transformers/models/sam3/image_processing_sam3_fast.py +17 -20
  1228. transformers/models/sam3/modeling_sam3.py +94 -100
  1229. transformers/models/sam3/modular_sam3.py +3 -8
  1230. transformers/models/sam3/processing_sam3.py +37 -52
  1231. transformers/models/sam3_tracker/__init__.py +0 -1
  1232. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -3
  1233. transformers/models/sam3_tracker/modeling_sam3_tracker.py +79 -80
  1234. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -2
  1235. transformers/models/sam3_tracker/processing_sam3_tracker.py +31 -48
  1236. transformers/models/sam3_tracker_video/__init__.py +0 -1
  1237. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +0 -1
  1238. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +115 -114
  1239. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -24
  1240. transformers/models/sam3_tracker_video/processing_sam3_tracker_video.py +50 -66
  1241. transformers/models/sam3_video/configuration_sam3_video.py +0 -1
  1242. transformers/models/sam3_video/modeling_sam3_video.py +56 -45
  1243. transformers/models/sam3_video/processing_sam3_video.py +25 -45
  1244. transformers/models/sam_hq/__init__.py +1 -1
  1245. transformers/models/sam_hq/configuration_sam_hq.py +2 -1
  1246. transformers/models/sam_hq/modeling_sam_hq.py +52 -50
  1247. transformers/models/sam_hq/modular_sam_hq.py +23 -25
  1248. transformers/models/sam_hq/{processing_samhq.py → processing_sam_hq.py} +41 -29
  1249. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -10
  1250. transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py +8 -11
  1251. transformers/models/seamless_m4t/modeling_seamless_m4t.py +180 -182
  1252. transformers/models/seamless_m4t/processing_seamless_m4t.py +18 -39
  1253. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +15 -20
  1254. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -10
  1255. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +193 -195
  1256. transformers/models/seed_oss/configuration_seed_oss.py +30 -34
  1257. transformers/models/seed_oss/modeling_seed_oss.py +34 -36
  1258. transformers/models/seed_oss/modular_seed_oss.py +6 -7
  1259. transformers/models/segformer/configuration_segformer.py +0 -10
  1260. transformers/models/segformer/image_processing_segformer.py +39 -42
  1261. transformers/models/segformer/image_processing_segformer_fast.py +11 -12
  1262. transformers/models/segformer/modeling_segformer.py +28 -28
  1263. transformers/models/segformer/modular_segformer.py +8 -9
  1264. transformers/models/seggpt/configuration_seggpt.py +0 -1
  1265. transformers/models/seggpt/image_processing_seggpt.py +38 -41
  1266. transformers/models/seggpt/modeling_seggpt.py +48 -38
  1267. transformers/models/sew/configuration_sew.py +4 -2
  1268. transformers/models/sew/modeling_sew.py +42 -40
  1269. transformers/models/sew/modular_sew.py +12 -13
  1270. transformers/models/sew_d/configuration_sew_d.py +4 -2
  1271. transformers/models/sew_d/modeling_sew_d.py +32 -31
  1272. transformers/models/shieldgemma2/configuration_shieldgemma2.py +0 -1
  1273. transformers/models/shieldgemma2/modeling_shieldgemma2.py +19 -21
  1274. transformers/models/shieldgemma2/processing_shieldgemma2.py +3 -5
  1275. transformers/models/siglip/configuration_siglip.py +4 -2
  1276. transformers/models/siglip/image_processing_siglip.py +17 -20
  1277. transformers/models/siglip/image_processing_siglip_fast.py +0 -1
  1278. transformers/models/siglip/modeling_siglip.py +65 -110
  1279. transformers/models/siglip/processing_siglip.py +2 -14
  1280. transformers/models/siglip/tokenization_siglip.py +6 -7
  1281. transformers/models/siglip2/__init__.py +1 -0
  1282. transformers/models/siglip2/configuration_siglip2.py +4 -2
  1283. transformers/models/siglip2/image_processing_siglip2.py +15 -16
  1284. transformers/models/siglip2/image_processing_siglip2_fast.py +6 -7
  1285. transformers/models/siglip2/modeling_siglip2.py +89 -130
  1286. transformers/models/siglip2/modular_siglip2.py +95 -48
  1287. transformers/models/siglip2/processing_siglip2.py +2 -14
  1288. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  1289. transformers/models/smollm3/configuration_smollm3.py +29 -32
  1290. transformers/models/smollm3/modeling_smollm3.py +35 -38
  1291. transformers/models/smollm3/modular_smollm3.py +36 -38
  1292. transformers/models/smolvlm/configuration_smolvlm.py +2 -4
  1293. transformers/models/smolvlm/image_processing_smolvlm.py +42 -43
  1294. transformers/models/smolvlm/image_processing_smolvlm_fast.py +41 -15
  1295. transformers/models/smolvlm/modeling_smolvlm.py +124 -96
  1296. transformers/models/smolvlm/modular_smolvlm.py +50 -39
  1297. transformers/models/smolvlm/processing_smolvlm.py +15 -76
  1298. transformers/models/smolvlm/video_processing_smolvlm.py +16 -17
  1299. transformers/models/solar_open/__init__.py +27 -0
  1300. transformers/models/solar_open/configuration_solar_open.py +184 -0
  1301. transformers/models/solar_open/modeling_solar_open.py +642 -0
  1302. transformers/models/solar_open/modular_solar_open.py +224 -0
  1303. transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +0 -1
  1304. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +26 -27
  1305. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -9
  1306. transformers/models/speech_to_text/feature_extraction_speech_to_text.py +10 -13
  1307. transformers/models/speech_to_text/modeling_speech_to_text.py +55 -57
  1308. transformers/models/speech_to_text/processing_speech_to_text.py +4 -30
  1309. transformers/models/speech_to_text/tokenization_speech_to_text.py +5 -6
  1310. transformers/models/speecht5/configuration_speecht5.py +7 -9
  1311. transformers/models/speecht5/feature_extraction_speecht5.py +16 -37
  1312. transformers/models/speecht5/modeling_speecht5.py +172 -174
  1313. transformers/models/speecht5/number_normalizer.py +0 -1
  1314. transformers/models/speecht5/processing_speecht5.py +3 -37
  1315. transformers/models/speecht5/tokenization_speecht5.py +4 -5
  1316. transformers/models/splinter/configuration_splinter.py +6 -7
  1317. transformers/models/splinter/modeling_splinter.py +62 -59
  1318. transformers/models/splinter/tokenization_splinter.py +2 -4
  1319. transformers/models/squeezebert/configuration_squeezebert.py +14 -2
  1320. transformers/models/squeezebert/modeling_squeezebert.py +60 -62
  1321. transformers/models/squeezebert/tokenization_squeezebert.py +0 -1
  1322. transformers/models/stablelm/configuration_stablelm.py +28 -29
  1323. transformers/models/stablelm/modeling_stablelm.py +44 -47
  1324. transformers/models/starcoder2/configuration_starcoder2.py +30 -27
  1325. transformers/models/starcoder2/modeling_starcoder2.py +38 -41
  1326. transformers/models/starcoder2/modular_starcoder2.py +17 -19
  1327. transformers/models/superglue/configuration_superglue.py +7 -3
  1328. transformers/models/superglue/image_processing_superglue.py +15 -15
  1329. transformers/models/superglue/image_processing_superglue_fast.py +8 -8
  1330. transformers/models/superglue/modeling_superglue.py +41 -37
  1331. transformers/models/superpoint/image_processing_superpoint.py +15 -15
  1332. transformers/models/superpoint/image_processing_superpoint_fast.py +7 -9
  1333. transformers/models/superpoint/modeling_superpoint.py +17 -16
  1334. transformers/models/swiftformer/configuration_swiftformer.py +0 -1
  1335. transformers/models/swiftformer/modeling_swiftformer.py +12 -14
  1336. transformers/models/swin/configuration_swin.py +2 -5
  1337. transformers/models/swin/modeling_swin.py +69 -78
  1338. transformers/models/swin2sr/configuration_swin2sr.py +0 -1
  1339. transformers/models/swin2sr/image_processing_swin2sr.py +10 -13
  1340. transformers/models/swin2sr/image_processing_swin2sr_fast.py +4 -7
  1341. transformers/models/swin2sr/modeling_swin2sr.py +30 -30
  1342. transformers/models/swinv2/configuration_swinv2.py +2 -5
  1343. transformers/models/swinv2/modeling_swinv2.py +65 -74
  1344. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -7
  1345. transformers/models/switch_transformers/modeling_switch_transformers.py +35 -36
  1346. transformers/models/switch_transformers/modular_switch_transformers.py +32 -33
  1347. transformers/models/t5/configuration_t5.py +9 -9
  1348. transformers/models/t5/modeling_t5.py +80 -85
  1349. transformers/models/t5/tokenization_t5.py +1 -3
  1350. transformers/models/t5gemma/configuration_t5gemma.py +43 -59
  1351. transformers/models/t5gemma/modeling_t5gemma.py +105 -108
  1352. transformers/models/t5gemma/modular_t5gemma.py +128 -142
  1353. transformers/models/t5gemma2/configuration_t5gemma2.py +86 -100
  1354. transformers/models/t5gemma2/modeling_t5gemma2.py +234 -194
  1355. transformers/models/t5gemma2/modular_t5gemma2.py +279 -264
  1356. transformers/models/table_transformer/configuration_table_transformer.py +18 -50
  1357. transformers/models/table_transformer/modeling_table_transformer.py +73 -101
  1358. transformers/models/tapas/configuration_tapas.py +12 -2
  1359. transformers/models/tapas/modeling_tapas.py +65 -67
  1360. transformers/models/tapas/tokenization_tapas.py +116 -153
  1361. transformers/models/textnet/configuration_textnet.py +4 -7
  1362. transformers/models/textnet/image_processing_textnet.py +22 -25
  1363. transformers/models/textnet/image_processing_textnet_fast.py +8 -9
  1364. transformers/models/textnet/modeling_textnet.py +28 -28
  1365. transformers/models/time_series_transformer/configuration_time_series_transformer.py +5 -8
  1366. transformers/models/time_series_transformer/modeling_time_series_transformer.py +82 -84
  1367. transformers/models/timesfm/configuration_timesfm.py +0 -1
  1368. transformers/models/timesfm/modeling_timesfm.py +22 -25
  1369. transformers/models/timesfm/modular_timesfm.py +21 -24
  1370. transformers/models/timesformer/configuration_timesformer.py +0 -1
  1371. transformers/models/timesformer/modeling_timesformer.py +13 -16
  1372. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -8
  1373. transformers/models/timm_backbone/modeling_timm_backbone.py +25 -30
  1374. transformers/models/timm_wrapper/configuration_timm_wrapper.py +2 -3
  1375. transformers/models/timm_wrapper/image_processing_timm_wrapper.py +4 -5
  1376. transformers/models/timm_wrapper/modeling_timm_wrapper.py +22 -19
  1377. transformers/models/trocr/configuration_trocr.py +11 -8
  1378. transformers/models/trocr/modeling_trocr.py +42 -42
  1379. transformers/models/trocr/processing_trocr.py +5 -25
  1380. transformers/models/tvp/configuration_tvp.py +10 -36
  1381. transformers/models/tvp/image_processing_tvp.py +50 -52
  1382. transformers/models/tvp/image_processing_tvp_fast.py +15 -15
  1383. transformers/models/tvp/modeling_tvp.py +26 -28
  1384. transformers/models/tvp/processing_tvp.py +2 -14
  1385. transformers/models/udop/configuration_udop.py +16 -8
  1386. transformers/models/udop/modeling_udop.py +73 -72
  1387. transformers/models/udop/processing_udop.py +7 -26
  1388. transformers/models/udop/tokenization_udop.py +80 -93
  1389. transformers/models/umt5/configuration_umt5.py +8 -7
  1390. transformers/models/umt5/modeling_umt5.py +87 -84
  1391. transformers/models/unispeech/configuration_unispeech.py +4 -2
  1392. transformers/models/unispeech/modeling_unispeech.py +54 -53
  1393. transformers/models/unispeech/modular_unispeech.py +20 -22
  1394. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -2
  1395. transformers/models/unispeech_sat/modeling_unispeech_sat.py +70 -69
  1396. transformers/models/unispeech_sat/modular_unispeech_sat.py +21 -23
  1397. transformers/models/univnet/feature_extraction_univnet.py +14 -14
  1398. transformers/models/univnet/modeling_univnet.py +7 -8
  1399. transformers/models/upernet/configuration_upernet.py +8 -36
  1400. transformers/models/upernet/modeling_upernet.py +11 -14
  1401. transformers/models/vaultgemma/__init__.py +0 -1
  1402. transformers/models/vaultgemma/configuration_vaultgemma.py +29 -33
  1403. transformers/models/vaultgemma/modeling_vaultgemma.py +38 -40
  1404. transformers/models/vaultgemma/modular_vaultgemma.py +29 -31
  1405. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  1406. transformers/models/video_llama_3/image_processing_video_llama_3.py +40 -40
  1407. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +12 -14
  1408. transformers/models/video_llama_3/modeling_video_llama_3.py +149 -112
  1409. transformers/models/video_llama_3/modular_video_llama_3.py +152 -150
  1410. transformers/models/video_llama_3/processing_video_llama_3.py +5 -39
  1411. transformers/models/video_llama_3/video_processing_video_llama_3.py +45 -24
  1412. transformers/models/video_llava/configuration_video_llava.py +4 -1
  1413. transformers/models/video_llava/image_processing_video_llava.py +35 -38
  1414. transformers/models/video_llava/modeling_video_llava.py +139 -143
  1415. transformers/models/video_llava/processing_video_llava.py +38 -78
  1416. transformers/models/video_llava/video_processing_video_llava.py +0 -1
  1417. transformers/models/videomae/configuration_videomae.py +0 -1
  1418. transformers/models/videomae/image_processing_videomae.py +31 -34
  1419. transformers/models/videomae/modeling_videomae.py +17 -20
  1420. transformers/models/videomae/video_processing_videomae.py +0 -1
  1421. transformers/models/vilt/configuration_vilt.py +4 -2
  1422. transformers/models/vilt/image_processing_vilt.py +29 -30
  1423. transformers/models/vilt/image_processing_vilt_fast.py +15 -16
  1424. transformers/models/vilt/modeling_vilt.py +103 -90
  1425. transformers/models/vilt/processing_vilt.py +2 -14
  1426. transformers/models/vipllava/configuration_vipllava.py +4 -1
  1427. transformers/models/vipllava/modeling_vipllava.py +92 -67
  1428. transformers/models/vipllava/modular_vipllava.py +78 -54
  1429. transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +0 -1
  1430. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +28 -27
  1431. transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +0 -1
  1432. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +45 -41
  1433. transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +2 -16
  1434. transformers/models/visual_bert/configuration_visual_bert.py +6 -2
  1435. transformers/models/visual_bert/modeling_visual_bert.py +90 -92
  1436. transformers/models/vit/configuration_vit.py +2 -3
  1437. transformers/models/vit/image_processing_vit.py +19 -22
  1438. transformers/models/vit/image_processing_vit_fast.py +0 -1
  1439. transformers/models/vit/modeling_vit.py +20 -20
  1440. transformers/models/vit_mae/configuration_vit_mae.py +0 -1
  1441. transformers/models/vit_mae/modeling_vit_mae.py +32 -30
  1442. transformers/models/vit_msn/configuration_vit_msn.py +0 -1
  1443. transformers/models/vit_msn/modeling_vit_msn.py +21 -19
  1444. transformers/models/vitdet/configuration_vitdet.py +2 -5
  1445. transformers/models/vitdet/modeling_vitdet.py +14 -17
  1446. transformers/models/vitmatte/configuration_vitmatte.py +7 -39
  1447. transformers/models/vitmatte/image_processing_vitmatte.py +15 -18
  1448. transformers/models/vitmatte/image_processing_vitmatte_fast.py +16 -17
  1449. transformers/models/vitmatte/modeling_vitmatte.py +10 -12
  1450. transformers/models/vitpose/configuration_vitpose.py +7 -47
  1451. transformers/models/vitpose/image_processing_vitpose.py +24 -25
  1452. transformers/models/vitpose/image_processing_vitpose_fast.py +9 -10
  1453. transformers/models/vitpose/modeling_vitpose.py +15 -15
  1454. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -5
  1455. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +13 -16
  1456. transformers/models/vits/configuration_vits.py +4 -1
  1457. transformers/models/vits/modeling_vits.py +43 -42
  1458. transformers/models/vits/tokenization_vits.py +3 -4
  1459. transformers/models/vivit/configuration_vivit.py +0 -1
  1460. transformers/models/vivit/image_processing_vivit.py +36 -39
  1461. transformers/models/vivit/modeling_vivit.py +9 -11
  1462. transformers/models/vjepa2/__init__.py +0 -1
  1463. transformers/models/vjepa2/configuration_vjepa2.py +0 -1
  1464. transformers/models/vjepa2/modeling_vjepa2.py +39 -41
  1465. transformers/models/vjepa2/video_processing_vjepa2.py +0 -1
  1466. transformers/models/voxtral/__init__.py +0 -1
  1467. transformers/models/voxtral/configuration_voxtral.py +0 -2
  1468. transformers/models/voxtral/modeling_voxtral.py +41 -48
  1469. transformers/models/voxtral/modular_voxtral.py +35 -38
  1470. transformers/models/voxtral/processing_voxtral.py +25 -48
  1471. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -2
  1472. transformers/models/wav2vec2/feature_extraction_wav2vec2.py +7 -10
  1473. transformers/models/wav2vec2/modeling_wav2vec2.py +74 -126
  1474. transformers/models/wav2vec2/processing_wav2vec2.py +6 -35
  1475. transformers/models/wav2vec2/tokenization_wav2vec2.py +20 -332
  1476. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -2
  1477. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +49 -52
  1478. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +45 -48
  1479. transformers/models/wav2vec2_bert/processing_wav2vec2_bert.py +6 -35
  1480. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -2
  1481. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +62 -65
  1482. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +15 -18
  1483. transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +16 -17
  1484. transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +36 -55
  1485. transformers/models/wavlm/configuration_wavlm.py +4 -2
  1486. transformers/models/wavlm/modeling_wavlm.py +49 -49
  1487. transformers/models/wavlm/modular_wavlm.py +4 -5
  1488. transformers/models/whisper/configuration_whisper.py +6 -5
  1489. transformers/models/whisper/english_normalizer.py +3 -4
  1490. transformers/models/whisper/feature_extraction_whisper.py +9 -24
  1491. transformers/models/whisper/generation_whisper.py +26 -49
  1492. transformers/models/whisper/modeling_whisper.py +71 -73
  1493. transformers/models/whisper/processing_whisper.py +3 -20
  1494. transformers/models/whisper/tokenization_whisper.py +9 -30
  1495. transformers/models/x_clip/configuration_x_clip.py +4 -2
  1496. transformers/models/x_clip/modeling_x_clip.py +94 -96
  1497. transformers/models/x_clip/processing_x_clip.py +2 -14
  1498. transformers/models/xcodec/configuration_xcodec.py +4 -6
  1499. transformers/models/xcodec/modeling_xcodec.py +15 -17
  1500. transformers/models/xglm/configuration_xglm.py +9 -8
  1501. transformers/models/xglm/modeling_xglm.py +49 -55
  1502. transformers/models/xglm/tokenization_xglm.py +1 -4
  1503. transformers/models/xlm/configuration_xlm.py +10 -8
  1504. transformers/models/xlm/modeling_xlm.py +127 -131
  1505. transformers/models/xlm/tokenization_xlm.py +3 -5
  1506. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -3
  1507. transformers/models/xlm_roberta/modeling_xlm_roberta.py +96 -98
  1508. transformers/models/xlm_roberta/modular_xlm_roberta.py +50 -53
  1509. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +1 -4
  1510. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -2
  1511. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +97 -99
  1512. transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +67 -70
  1513. transformers/models/xlnet/configuration_xlnet.py +3 -12
  1514. transformers/models/xlnet/modeling_xlnet.py +149 -162
  1515. transformers/models/xlnet/tokenization_xlnet.py +1 -4
  1516. transformers/models/xlstm/configuration_xlstm.py +8 -12
  1517. transformers/models/xlstm/modeling_xlstm.py +61 -96
  1518. transformers/models/xmod/configuration_xmod.py +11 -3
  1519. transformers/models/xmod/modeling_xmod.py +111 -116
  1520. transformers/models/yolos/configuration_yolos.py +0 -1
  1521. transformers/models/yolos/image_processing_yolos.py +60 -62
  1522. transformers/models/yolos/image_processing_yolos_fast.py +42 -45
  1523. transformers/models/yolos/modeling_yolos.py +19 -21
  1524. transformers/models/yolos/modular_yolos.py +17 -19
  1525. transformers/models/yoso/configuration_yoso.py +8 -2
  1526. transformers/models/yoso/modeling_yoso.py +60 -62
  1527. transformers/models/youtu/__init__.py +27 -0
  1528. transformers/models/youtu/configuration_youtu.py +194 -0
  1529. transformers/models/youtu/modeling_youtu.py +619 -0
  1530. transformers/models/youtu/modular_youtu.py +254 -0
  1531. transformers/models/zamba/configuration_zamba.py +5 -8
  1532. transformers/models/zamba/modeling_zamba.py +93 -125
  1533. transformers/models/zamba2/configuration_zamba2.py +44 -50
  1534. transformers/models/zamba2/modeling_zamba2.py +137 -165
  1535. transformers/models/zamba2/modular_zamba2.py +79 -74
  1536. transformers/models/zoedepth/configuration_zoedepth.py +17 -41
  1537. transformers/models/zoedepth/image_processing_zoedepth.py +28 -29
  1538. transformers/models/zoedepth/image_processing_zoedepth_fast.py +20 -21
  1539. transformers/models/zoedepth/modeling_zoedepth.py +19 -19
  1540. transformers/pipelines/__init__.py +47 -106
  1541. transformers/pipelines/any_to_any.py +15 -23
  1542. transformers/pipelines/audio_utils.py +1 -2
  1543. transformers/pipelines/automatic_speech_recognition.py +0 -2
  1544. transformers/pipelines/base.py +13 -17
  1545. transformers/pipelines/image_text_to_text.py +1 -2
  1546. transformers/pipelines/question_answering.py +4 -43
  1547. transformers/pipelines/text_classification.py +1 -14
  1548. transformers/pipelines/text_to_audio.py +5 -1
  1549. transformers/pipelines/token_classification.py +1 -22
  1550. transformers/pipelines/video_classification.py +1 -9
  1551. transformers/pipelines/zero_shot_audio_classification.py +0 -1
  1552. transformers/pipelines/zero_shot_classification.py +0 -6
  1553. transformers/pipelines/zero_shot_image_classification.py +0 -7
  1554. transformers/processing_utils.py +128 -137
  1555. transformers/pytorch_utils.py +2 -26
  1556. transformers/quantizers/base.py +10 -0
  1557. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  1558. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  1559. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  1560. transformers/quantizers/quantizer_mxfp4.py +1 -1
  1561. transformers/quantizers/quantizer_quark.py +0 -1
  1562. transformers/quantizers/quantizer_torchao.py +3 -19
  1563. transformers/safetensors_conversion.py +11 -4
  1564. transformers/testing_utils.py +6 -65
  1565. transformers/tokenization_mistral_common.py +563 -903
  1566. transformers/tokenization_python.py +6 -4
  1567. transformers/tokenization_utils_base.py +228 -341
  1568. transformers/tokenization_utils_sentencepiece.py +5 -6
  1569. transformers/tokenization_utils_tokenizers.py +36 -7
  1570. transformers/trainer.py +30 -41
  1571. transformers/trainer_jit_checkpoint.py +1 -2
  1572. transformers/trainer_seq2seq.py +1 -1
  1573. transformers/training_args.py +414 -420
  1574. transformers/utils/__init__.py +1 -4
  1575. transformers/utils/attention_visualizer.py +1 -1
  1576. transformers/utils/auto_docstring.py +567 -18
  1577. transformers/utils/backbone_utils.py +13 -373
  1578. transformers/utils/doc.py +4 -36
  1579. transformers/utils/dummy_pt_objects.py +0 -42
  1580. transformers/utils/generic.py +70 -34
  1581. transformers/utils/import_utils.py +72 -75
  1582. transformers/utils/loading_report.py +135 -107
  1583. transformers/utils/quantization_config.py +8 -31
  1584. transformers/video_processing_utils.py +24 -25
  1585. transformers/video_utils.py +21 -23
  1586. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/METADATA +120 -239
  1587. transformers-5.1.0.dist-info/RECORD +2092 -0
  1588. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1589. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1590. transformers/pipelines/image_to_text.py +0 -229
  1591. transformers-5.0.0rc2.dist-info/RECORD +0 -2042
  1592. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1593. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1594. {transformers-5.0.0rc2.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
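The hunk below is the body of one of this release's newly added files (1,640 added lines, none removed). Judging from the GlmImage* configuration classes it defines and its imports from glm4v, chameleon, qwen2_vl, and siglip, it appears to be the new GLM-Image modular definition file.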
@@ -0,0 +1,1640 @@
+ # Copyright 2025 the HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from collections.abc import Callable
+ from typing import Any
+
+ import numpy as np
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from ...cache_utils import Cache
+ from ...configuration_utils import PreTrainedConfig
+ from ...feature_extraction_utils import BatchFeature
+ from ...generation import GenerationMixin
+ from ...image_utils import ImageInput
+ from ...modeling_outputs import BaseModelOutputWithPooling
+ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+ from ...processing_utils import ImagesKwargs, ProcessorMixin, Unpack
+ from ...tokenization_utils_base import PreTokenizedInput, TextInput
+ from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_available, logging
+ from ...utils.generic import check_model_inputs
+ from ..chameleon.modeling_chameleon import ChameleonVQVAE, ChameleonVQVAEModelOutput, ChameleonVQVAEVectorQuantizer
+ from ..glm4v.configuration_glm4v import Glm4vTextConfig, Glm4vVisionConfig
+ from ..glm4v.modeling_glm4v import (
+     Glm4vCausalLMOutputWithPast,
+     Glm4vModel,
+     Glm4vModelOutputWithPast,
+     Glm4vPreTrainedModel,
+     Glm4vTextModel,
+     Glm4vVisionAttention,
+     Glm4vVisionBlock,
+     Glm4vVisionEmbeddings,
+     Glm4vVisionModel,
+     Glm4vVisionPatchEmbed,
+ )
+ from ..glm4v_moe.modeling_glm4v_moe import Glm4vMoeTextAttention, eager_attention_forward
+ from ..qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
+ from ..qwen2_vl.image_processing_qwen2_vl_fast import Qwen2VLImageProcessorFast
+ from ..qwen2_vl.processing_qwen2_vl import Qwen2VLProcessorKwargs
+ from ..siglip.modeling_siglip import SiglipMLP
+
+
+ if is_torch_available():
+     import torch
+
+ logger = logging.get_logger(__name__)
+
+
+ class GlmImageVQVAEConfig(PreTrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`GlmImageVQModel`]. It is used to instantiate a
+     `GlmImageVQModel` according to the specified arguments, defining the model architecture.
+     Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PreTrainedConfig`] for more information. Instantiating a
+     configuration with the defaults will yield a similar configuration to the VQModel of the
+     [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image) architecture.
+
+     Args:
+         embed_dim (`int`, *optional*, defaults to 2048):
+             Dimensionality of each embedding vector.
+         num_embeddings (`int`, *optional*, defaults to 16384):
+             Number of codebook embeddings.
+         latent_channels (`int`, *optional*, defaults to 1536):
+             Number of channels for the latent space.
+         in_channels (`int`, *optional*, defaults to 3):
+             Number of input channels.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+     """
+
+     model_type = "glm_image_vqmodel"
+     base_config_key = "vq_config"
+
+     def __init__(
+         self,
+         embed_dim: int = 2048,
+         num_embeddings: int = 16384,
+         latent_channels: int = 1536,
+         in_channels: int = 3,
+         initializer_range=0.02,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.embed_dim = embed_dim
+         self.num_embeddings = num_embeddings
+         self.latent_channels = latent_channels
+         self.in_channels = in_channels
+         self.initializer_range = initializer_range
+
+
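A minimal usage sketch of the VQ-VAE config defined above (illustrative, not part of the diff; the top-level import path is an assumption):

    # Assumed import path; the class itself is defined in the hunk above.
    from transformers import GlmImageVQVAEConfig

    # The keyword values below are just the documented defaults made explicit.
    vq_config = GlmImageVQVAEConfig(
        embed_dim=2048,        # dimensionality of each codebook vector
        num_embeddings=16384,  # codebook size
        latent_channels=1536,  # channels of the latent space
        in_channels=3,         # RGB input
    )
    assert vq_config.model_type == "glm_image_vqmodel"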
+ class GlmImageVisionConfig(Glm4vVisionConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`GlmImageVisionModel`]. It is used to instantiate a
+     GlmImageVisionModel according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of
+     GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image).
+
+     Args:
+         depth (`int`, *optional*, defaults to 40):
+             Number of layers (depth) in the model.
+         hidden_size (`int`, *optional*, defaults to 1536):
+             Dimensionality of the encoder layers and the pooler layer.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"` and `"gelu_new"` are supported.
+         attention_bias (`bool`, *optional*, defaults to `True`):
+             Whether to add a bias to the queries, keys and values.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             Dropout probability for attention weights.
+         num_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer architecture.
+         in_channels (`int`, *optional*, defaults to 3):
+             Number of input channels.
+         image_size (`int` or `list[int]`, *optional*, defaults to 2048):
+             The size (resolution) of each image.
+         patch_size (`int`, *optional*, defaults to 16):
+             The size (resolution) of each patch.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the layer normalization layers.
+         spatial_merge_size (`int`, *optional*, defaults to 1):
+             The size used for merging spatial dimensions.
+         intermediate_size (`int`, *optional*, defaults to 6144):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+     """
+
+     model_type = "glm_image_vision"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         depth=40,
+         hidden_size=1536,
+         hidden_act="gelu",
+         attention_bias=True,
+         attention_dropout=0.0,
+         num_heads=16,
+         in_channels=3,
+         image_size=2048,
+         patch_size=16,
+         layer_norm_eps=1e-06,
+         spatial_merge_size=1,
+         intermediate_size=6144,
+         initializer_range=0.02,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         del self.out_hidden_size
+         del self.rms_norm_eps
+         del self.temporal_patch_size
+         self.layer_norm_eps = layer_norm_eps
+
+
167
+ class GlmImageTextConfig(Glm4vTextConfig):
168
+ r"""
169
+ This is the configuration class to store the configuration of a [`GlmImageTextModel`]. It is used to instantiate a
170
+ GLM-Image model according to the specified arguments, defining the model architecture. Instantiating a
171
+ configuration with the defaults will yield a similar configuration to that of
172
+ GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image).
173
+
174
+ Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
175
+ documentation from [`PreTrainedConfig`] for more information.
176
+
177
+ Args:
178
+ vocab_size (`int`, *optional*, defaults to 168064):
179
+ Vocabulary size of the GlmImage model. Defines the number of different tokens that can be represented by
180
+ the `input_ids` passed when calling [`GlmImageModel`]
181
+ hidden_size (`int`, *optional*, defaults to 4096):
182
+ Dimension of the hidden representations.
183
+ intermediate_size (`int`, *optional*, defaults to 13696):
184
+ Dimension of the MLP representations.
185
+ num_hidden_layers (`int`, *optional*, defaults to 40):
186
+ Number of hidden layers in the Transformer encoder.
187
+ num_attention_heads (`int`, *optional*, defaults to 32):
188
+ Number of attention heads for each attention layer in the Transformer encoder.
189
+ num_key_value_heads (`int`, *optional*, defaults to 2):
190
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
191
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
192
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
193
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
194
+ by meanpooling all the original heads within that group. For more details, check out [this
195
+ paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `2`.
196
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
197
+ The non-linear activation function (function or string) in the decoder.
198
+ max_position_embeddings (`int`, *optional*, defaults to 131072):
199
+ The maximum sequence length that this model might ever be used with.
200
+ initializer_range (`float`, *optional*, defaults to 0.02):
201
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
202
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
203
+ The epsilon used by the rms normalization layers.
204
+ use_cache (`bool`, *optional*, defaults to `True`):
205
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
206
+ relevant if `config.is_decoder=True`.
207
+ attention_dropout (`float`, *optional*, defaults to 0.0):
208
+ The dropout ratio for the attention probabilities.
209
+ rope_parameters (`RopeParameters`, *optional*):
210
+ Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
211
+ a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
212
+ with longer `max_position_embeddings`.
213
+ pad_token_id (`int`, *optional*, defaults to 167841):
214
+ The id of the padding token.
215
+ vision_vocab_size (`int`, *optional*, defaults to 16512):
216
+ Vision vocabulary size of the GlmImage model. Defines the number of different tokens that can be
217
+ represented by the `input_ids` passed when calling [`GlmImageVisionModel`]
218
+ attention_bias (`bool`, *optional*, defaults to `True`):
219
+ Whether to add a bias to the queries, keys and values.
220
+ eos_token_id (`int`, *optional*, defaults to 16385):
221
+ The id of the end of sequence token.
222
+
223
+ ```python
224
+ >>> from transformers import GlmImageTextModel, GlmImageTextConfig
225
+
226
+ >>> # Initializing a GlmImageTextConfig style configuration
227
+ >>> configuration = GlmImageTextConfig()
228
+
229
+ >>> # Initializing a model from the GlmImageTextConfig style configuration
230
+ >>> model = GlmImageTextModel(configuration)
231
+
232
+ >>> # Accessing the model configuration
233
+ >>> configuration = model.config
234
+ ```"""
235
+
236
+ def __init__(
237
+ self,
238
+ vocab_size: int = 168064,
239
+ max_position_embeddings: int = 131072,
240
+ vision_vocab_size: int = 16512,
241
+ attention_bias: bool = True,
242
+ pad_token_id: int = 167841,
243
+ eos_token_id: int = 16385,
244
+ **super_kwargs,
245
+ ):
246
+ super().__init__(
247
+ vocab_size=vocab_size,
248
+ max_position_embeddings=max_position_embeddings,
249
+ pad_token_id=pad_token_id,
250
+ **super_kwargs,
251
+ )
252
+ self.vision_vocab_size = vision_vocab_size
253
+ self.attention_bias = attention_bias
254
+ self.eos_token_id = eos_token_id
255
+
256
+
257
+ class GlmImageConfig(PreTrainedConfig):
258
+ r"""
259
+ This is the configuration class to store the configuration of a [`GlmImageModel`]. It is used to instantiate a
260
+ GLM-Image model according to the specified arguments, defining the model architecture. Instantiating a
261
+ configuration with the defaults will yield a similar configuration to that of
262
+ GLM-Image [zai-org/GLM-Image](https://huggingface.co/zai-org/GLM-Image) architecture.
263
+
264
+ Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
265
+ documentation from [`PreTrainedConfig`] for more information.
266
+
267
+ Args:
268
+ text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmImageTextConfig`):
269
+ The config object or dictionary of the text backbone.
270
+ vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmImageVisionConfig`):
271
+ The config object or dictionary of the vision backbone.
272
+ vq_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `GlmImageVQVAEConfig`):
273
+ GlmImageVQVAEConfig instance containing the configuration for the VQ-VAE model.
274
+ image_token_id (`int`, *optional*, defaults to 167855):
275
+ The image token index to encode the image prompt.
276
+ image_start_token_id (`int`, *optional*, defaults to 16384):
277
+ The image start token index to encode the start of image.
278
+ image_end_token_id (`int`, *optional*, defaults to 16385):
279
+ The image end token index to encode the end of image.
280
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
281
+ Whether the model's input and output word embeddings should be tied.
282
+
283
+ ```python
284
+ >>> from transformers import GlmImageForConditionalGeneration, GlmImageConfig
285
+
286
+ >>> # Initializing a GLM-Image style configuration
287
+ >>> configuration = GlmImageConfig()
288
+
289
+ >>> # Initializing a model from the GLM-Image style configuration
290
+ >>> model = GlmImageForConditionalGeneration(configuration)
291
+
292
+ >>> # Accessing the model configuration
293
+ >>> configuration = model.config
294
+ ```"""
295
+
296
+ model_type = "glm_image"
297
+ sub_configs = {
298
+ "vision_config": GlmImageVisionConfig,
299
+ "text_config": GlmImageTextConfig,
300
+ "vq_config": GlmImageVQVAEConfig,
301
+ }
302
+ keys_to_ignore_at_inference = ["past_key_values"]
303
+
304
+ def __init__(
305
+ self,
306
+ text_config=None,
307
+ vision_config=None,
308
+ vq_config=None,
309
+ image_token_id=167855,
310
+ image_start_token_id=16384,
311
+ image_end_token_id=16385,
312
+ tie_word_embeddings: bool | None = False,
313
+ **kwargs,
314
+ ):
315
+ if isinstance(vision_config, dict):
316
+ vision_config = self.sub_configs["vision_config"](**vision_config)
317
+ elif vision_config is None:
318
+ vision_config = self.sub_configs["vision_config"](**kwargs)
319
+
320
+ if isinstance(vq_config, dict):
321
+ vq_config = self.sub_configs["vq_config"](**vq_config)
322
+ elif vq_config is None:
323
+ vq_config = self.sub_configs["vq_config"](**kwargs)
324
+
325
+ if isinstance(text_config, dict):
326
+ text_config = self.sub_configs["text_config"](**text_config)
327
+ elif text_config is None:
328
+ text_config = self.sub_configs["text_config"](**kwargs)
329
+
330
+ self.image_token_id = image_token_id
331
+ self.image_start_token_id = image_start_token_id
332
+ self.image_end_token_id = image_end_token_id
333
+ self.text_config = text_config
334
+ self.vision_config = vision_config
335
+ self.vq_config = vq_config
336
+ self.tie_word_embeddings = tie_word_embeddings
337
+ super().__init__(**kwargs)
338
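+
+ # Note (illustrative): sub-configs may be passed as plain dicts and are promoted
+ # to config objects above, e.g. GlmImageConfig(vision_config={"depth": 40}).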
+
339
+
340
+ class GlmImageVisionMLP(SiglipMLP):
341
+ pass
342
+
343
+
344
+ class GlmImageVisionAttention(Glm4vVisionAttention):
345
+ def __init__(self, config: GlmImageVisionConfig) -> None:
346
+ super().__init__(config)
347
+ self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.attention_bias)
348
+ self.proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias)
349
+
350
+ def forward(
351
+ self,
352
+ hidden_states: torch.Tensor,
353
+ cu_seqlens: torch.Tensor,
354
+ **kwargs,
355
+ ) -> torch.Tensor:
356
+ seq_length = hidden_states.shape[0]
357
+ query_states, key_states, value_states = (
358
+ self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
359
+ )
360
+ query_states = query_states.transpose(0, 1).unsqueeze(0)
361
+ key_states = key_states.transpose(0, 1).unsqueeze(0)
362
+ value_states = value_states.transpose(0, 1).unsqueeze(0)
363
+
364
+ attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
365
+ self.config._attn_implementation, eager_attention_forward
366
+ )
367
+
368
+ if "flash" in self.config._attn_implementation:
369
+ # Flash Attention: Use cu_seqlens for variable length attention
370
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
371
+ attn_output, _ = attention_interface(
372
+ self,
373
+ query_states,
374
+ key_states,
375
+ value_states,
376
+ attention_mask=None,
377
+ scaling=self.scaling,
378
+ dropout=0.0 if not self.training else self.attention_dropout,
379
+ cu_seq_lens_q=cu_seqlens,
380
+ cu_seq_lens_k=cu_seqlens,
381
+ max_length_q=max_seqlen,
382
+ max_length_k=max_seqlen,
383
+ is_causal=False,
384
+ **kwargs,
385
+ )
386
+ else:
387
+ # Other implementations: Process each chunk separately
388
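+ # e.g. (illustrative) cu_seqlens = [0, 16, 20] -> lengths = [16, 4]; each image's
+ # patches attend only within their own chunk, never across image boundaries.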
+ lengths = cu_seqlens[1:] - cu_seqlens[:-1]
389
+ splits = [
390
+ torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
391
+ ]
392
+
393
+ attn_outputs = [
394
+ attention_interface(
395
+ self,
396
+ q,
397
+ k,
398
+ v,
399
+ attention_mask=None,
400
+ scaling=self.scaling,
401
+ dropout=0.0 if not self.training else self.attention_dropout,
402
+ is_causal=False,
403
+ **kwargs,
404
+ )[0]
405
+ for q, k, v in zip(*splits)
406
+ ]
407
+ attn_output = torch.cat(attn_outputs, dim=1)
408
+
409
+ attn_output = attn_output.reshape(seq_length, -1).contiguous()
410
+ attn_output = self.proj(attn_output)
411
+ return attn_output
412
+
413
+
414
+ class GlmImageVisionPatchEmbed(Glm4vVisionPatchEmbed):
415
+ def __init__(self, config: GlmImageVisionConfig) -> None:
416
+ super().__init__(config)
417
+
418
+ del self.temporal_patch_size
419
+ kernel_size = [self.patch_size, self.patch_size]
420
+ self.proj = nn.Conv2d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size)
421
+
422
+ def forward(self, hidden_states):
423
+ target_dtype = self.proj.weight.dtype
424
+ hidden_states = hidden_states.view(-1, self.in_channels, self.patch_size, self.patch_size)
425
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
426
+ return hidden_states
427
+
428
+
429
+ class GlmImageVisionEmbeddings(Glm4vVisionEmbeddings):
430
+ def __init__(self, config: GlmImageVisionConfig) -> None:
431
+ super().__init__(config)
432
+ self.interpolated_method = "bilinear"
433
+
434
+
435
+ class GlmImageVisionBlock(Glm4vVisionBlock):
436
+ def __init__(self, config: GlmImageVisionConfig):
437
+ super().__init__(config)
438
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
439
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
440
+ self.attn = GlmImageVisionAttention(config)
441
+ self.mlp = GlmImageVisionMLP(config)
442
+
443
+ def forward(
444
+ self,
445
+ hidden_states: torch.Tensor,
446
+ cu_seqlens: torch.Tensor,
447
+ **kwargs: Unpack[TransformersKwargs],
448
+ ) -> torch.Tensor:
449
+ r"""
450
+ cu_seqlens (`torch.Tensor` of shape `(num_images_or_videos + 1,)`):
451
+ The cumulative sequence lengths of each image or video feature.
452
454
+ """
455
+ residual = hidden_states
456
+
457
+ hidden_states = self.norm1(hidden_states)
458
+ hidden_states = self.attn(
459
+ hidden_states,
460
+ cu_seqlens=cu_seqlens,
461
+ **kwargs,
462
+ )
463
+ hidden_states = residual + hidden_states
464
+
465
+ residual = hidden_states
466
+ hidden_states = self.norm2(hidden_states)
467
+ hidden_states = self.mlp(hidden_states)
468
+ hidden_states = residual + hidden_states
469
+
470
+ return hidden_states
471
+
472
+
473
+ class GlmImageTextAttention(Glm4vMoeTextAttention):
474
+ pass
475
+
476
+
477
+ class GlmImagePreTrainedModel(Glm4vPreTrainedModel):
478
+ config: GlmImageConfig
479
+ input_modalities = ("image", "text")
480
+
481
+ @torch.no_grad()
482
+ def _init_weights(self, module):
483
+ PreTrainedModel._init_weights(self, module)
484
+
485
+
486
+ class GlmImageModelOutputWithPast(Glm4vModelOutputWithPast):
487
+ pass
488
+
489
+
490
+ class GlmImageVQVAEVectorQuantizer(ChameleonVQVAEVectorQuantizer):
491
+ def __init__(self, config: GlmImageVQVAEConfig):
492
+ super().__init__(config)
493
+ self.num_embeddings = config.num_embeddings
494
+ self.embedding_dim = config.embed_dim
495
+ self.beta = getattr(config, "beta", 0.25)
496
+
497
+ self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
498
+
499
+ def forward(self, hidden_state: torch.Tensor):
500
+ hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
501
+ hidden_state_flattened = hidden_state.view(-1, self.embedding_dim)
502
+
503
+ # L2 normalize
504
+ hidden_state = F.normalize(hidden_state, p=2, dim=-1)
505
+ hidden_state_flattened = F.normalize(hidden_state_flattened, p=2, dim=-1)
506
+ embedding = F.normalize(self.embedding.weight, p=2, dim=-1)
507
+
508
+ # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
509
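+ # Because z and e are both L2-normalized above, ||z - e||^2 = 2 - 2 * (z . e), so
+ # the argmin over these distances equals an argmax over cosine similarity.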
+ distances = (
510
+ torch.sum(hidden_state_flattened**2, dim=1, keepdim=True)
511
+ + torch.sum(embedding**2, dim=1)
512
+ - 2 * torch.einsum("bd,dn->bn", hidden_state_flattened, embedding.transpose(0, 1))
513
+ )
514
+
515
+ min_encoding_indices = torch.argmin(distances, dim=1)
516
+ hidden_state_quant = embedding[min_encoding_indices].view(hidden_state.shape)
517
+
518
+ # compute loss for embedding
519
+ loss = torch.mean((hidden_state_quant.detach() - hidden_state) ** 2) + self.beta * torch.mean(
520
+ (hidden_state_quant - hidden_state.detach()) ** 2
521
+ )
522
+
523
+ # preserve gradients
524
+ hidden_state_quant = hidden_state + (hidden_state_quant - hidden_state).detach()
525
+
526
+ # reshape back to match original input shape
527
+ hidden_state_quant = hidden_state_quant.permute(0, 3, 1, 2).contiguous()
528
+
529
+ return hidden_state_quant, loss, min_encoding_indices
530
+
531
+
532
+ class GlmImageVQVAEModelOutput(ChameleonVQVAEModelOutput):
533
+ pass
534
+
535
+
536
+ class GlmImageVQVAE(ChameleonVQVAE):
537
+ _no_split_modules = [
538
+ "GlmImageVQVAEVectorQuantizer",
539
+ ]
540
+ _can_record_outputs = {}
541
+
542
+ def __init__(self, config: GlmImageVQVAEConfig):
543
+ super().__init__(config)
544
+ del self.encoder
545
+
546
+ def encode(self, hidden_states):
547
+ conv_hidden_states = self.quant_conv(hidden_states)
548
+ quantized_last_hidden_state, emb_loss, indices = self.quantize(conv_hidden_states)
549
+ return GlmImageVQVAEModelOutput(
550
+ last_hidden_state=hidden_states,
551
+ quantized_last_hidden_state=quantized_last_hidden_state,
552
+ image_tokens=indices,
553
+ embedding_loss=emb_loss,
554
+ )
555
+
556
+
557
+ class GlmImageVisionModel(Glm4vVisionModel):
558
+ config: GlmImageVisionConfig
559
+ main_input_name = "pixel_values"
560
+ input_modalities = ("image",)
561
+
562
+ def __init__(self, config: GlmImageVisionConfig):
563
+ super().__init__(config)
564
+
565
+ head_dim = config.hidden_size // config.num_heads
566
+ self.head_dim = head_dim
567
+
568
+ del self.merger
569
+ del self.rotary_pos_emb
570
+ del self.post_conv_layernorm
571
+ del self.downsample
572
+ del self.post_layernorm
573
+
574
+ def rot_pos_emb(self, grid_thw):
575
+ pos_ids = []
576
+ for t, h, w in grid_thw:
577
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
578
+ hpos_ids = hpos_ids.reshape(
579
+ h // self.spatial_merge_size,
580
+ self.spatial_merge_size,
581
+ w // self.spatial_merge_size,
582
+ self.spatial_merge_size,
583
+ )
584
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
585
+ hpos_ids = hpos_ids.flatten()
586
+
587
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
588
+ wpos_ids = wpos_ids.reshape(
589
+ h // self.spatial_merge_size,
590
+ self.spatial_merge_size,
591
+ w // self.spatial_merge_size,
592
+ self.spatial_merge_size,
593
+ )
594
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
595
+ wpos_ids = wpos_ids.flatten()
596
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
597
+ pos_ids = torch.cat(pos_ids, dim=0)
598
+ return pos_ids
599
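+
+ # Illustrative: with spatial_merge_size=1 (the default), the reshape/permute above
+ # is a no-op and pos_ids are simply the (row, col) indices of each patch, e.g.
+ # for h=2, w=2: [[0, 0], [0, 1], [1, 0], [1, 1]].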
+
600
+ @check_model_inputs
601
+ @auto_docstring
602
+ def forward(
603
+ self, pixel_values: torch.Tensor, grid_thw: torch.Tensor, **kwargs: Unpack[TransformersKwargs]
604
+ ) -> tuple | BaseModelOutputWithPooling:
605
+ r"""
606
+ pixel_values (`torch.Tensor` of shape `(total_patches, num_channels * patch_size * patch_size)`):
607
+ Packed pixel values.
608
+ grid_thw (`torch.Tensor` of shape `(num_images, 3)`):
609
+ The temporal, height and width of feature shape of each image.
610
+
611
+ Returns:
612
+ [`BaseModelOutputWithPooling`] whose `last_hidden_state` has shape `(total_patches, hidden_size)`.
613
+ """
614
+
615
+ hidden_states = self.patch_embed(pixel_values)
616
+ image_type_ids = self.rot_pos_emb(grid_thw)
617
+
618
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
619
+ dim=0,
620
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
621
+ )
622
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
623
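+ # Illustrative: grid_thw = [[1, 4, 4], [1, 2, 2]] gives per-image lengths [16, 4]
+ # and cu_seqlens = [0, 16, 20] after the cumsum and left-pad above.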
+ seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
624
+ hidden_states = self.embeddings(
625
+ hidden_states,
626
+ seqlens,
627
+ grid_thw,
628
+ image_type_ids[:, 0].to(hidden_states.device),
629
+ image_type_ids[:, 1].to(hidden_states.device),
630
+ )
631
+
632
+ # Transformer blocks (no position_embeddings needed, already added above)
633
+ for blk in self.blocks:
634
+ hidden_states = blk(
635
+ hidden_states,
636
+ cu_seqlens=cu_seqlens,
637
+ )
638
+
639
+ return BaseModelOutputWithPooling(last_hidden_state=hidden_states)
640
+
641
+
642
+ class GlmImageTextModel(Glm4vTextModel):
643
+ pass
644
+
645
+
646
+ class GlmImageModel(Glm4vModel):
647
+ def __init__(self, config):
648
+ super().__init__(config)
649
+ self.visual = GlmImageVisionModel._from_config(config.vision_config)
650
+ self.language_model = GlmImageTextModel._from_config(config.text_config)
651
+ self.vqmodel = GlmImageVQVAE._from_config(config.vq_config)
652
+
653
+ self.rope_deltas = None # cache rope_deltas here
654
+
655
+ # Per-sample caches for batch processing
656
+ self._cached_decode_position_ids = None # shape: [batch_size, 3, max_decode_len]
657
+ self._prefill_len = None # prefill sequence length (same for all samples in batch)
658
+
659
+ # Initialize weights and apply final processing
660
+ self.post_init()
661
+
662
+ def get_rope_index(
663
+ self,
664
+ input_ids: torch.LongTensor | None = None,
665
+ image_grid_thw: torch.LongTensor | None = None,
666
+ images_per_sample: torch.LongTensor | None = None,
667
+ attention_mask: torch.LongTensor | None = None,
668
+ ) -> tuple[torch.Tensor, torch.Tensor]:
669
+ """
670
+ Calculate the 3D rope index for image generation task with full batch support.
671
+
672
+ Args:
673
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
674
+ Indices of input sequence tokens in the vocabulary.
675
+ image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
676
+ The temporal, height and width of feature shape of each image.
677
+ Images are packed across all samples in the batch.
678
+ images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
679
+ Number of images (including target grids) for each sample in the batch.
680
+ Used to split image_grid_thw by sample.
681
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
682
+ Mask to avoid performing attention on padding token indices.
683
+
684
+ Returns:
685
+ position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`):
686
+ Position IDs for temporal, height, and width dimensions.
687
+ mrope_position_deltas (`torch.Tensor` of shape `(batch_size, 1)`):
688
+ Position deltas for multi-modal rotary position embedding.
689
+ """
690
+ batch_size, seq_len = input_ids.shape
691
+ device = input_ids.device
692
+ dtype = input_ids.dtype
693
+
694
+ image_start_token_id = self.config.image_start_token_id
695
+ image_end_token_id = self.config.image_end_token_id
696
+
697
+ position_ids = torch.ones(3, batch_size, seq_len, dtype=dtype, device=device)
698
+ text_positions = torch.arange(seq_len, device=device)[None, :].repeat(3, 1)
699
+
700
+ # Split image_grid_thw by sample if images_per_sample is provided
701
+ if image_grid_thw is not None and images_per_sample is not None:
702
+ grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
703
+ elif image_grid_thw is not None:
704
+ # Fallback: assume all grids belong to first sample (batch_size=1)
705
+ grids_per_sample = [image_grid_thw] * batch_size
706
+ else:
707
+ grids_per_sample = [None] * batch_size
708
+
709
+ # Per-sample caches for decode stage
710
+ all_decode_position_ids = []
711
+
712
+ for batch_idx in range(batch_size):
713
+ curr_input_ids = input_ids[batch_idx]
714
+ curr_grids = grids_per_sample[batch_idx]
715
+
716
+ if attention_mask is not None and attention_mask.shape[1] == seq_len:
717
+ valid_mask = attention_mask[batch_idx] == 1
718
+ curr_input_ids_valid = curr_input_ids[valid_mask]
719
+ else:
720
+ # attention_mask may have different length during assisted decoding
721
+ curr_input_ids_valid = curr_input_ids
722
+ valid_mask = None
723
+
724
+ # Find image boundaries in this sample
725
+ image_end_positions = torch.where(curr_input_ids_valid == image_end_token_id)[0]
726
+ image_start_positions = torch.where(curr_input_ids_valid == image_start_token_id)[0] + 1
727
+ num_complete_images = len(image_end_positions)
728
+
729
+ current_pos = 0
730
+ prev_image_end = 0
731
+ curr_position_ids = []
732
+
733
+ # Process complete images (source images in image-to-image task)
734
+ for img_idx, (start, end) in enumerate(zip(image_start_positions, image_end_positions)):
735
+ if curr_grids is None or img_idx >= len(curr_grids):
736
+ break
737
+ grid = curr_grids[img_idx]
738
+ # grid format is [temporal, height, width]
739
+ _, height, width = grid.tolist()
740
+
741
+ # Text tokens before this image
742
+ llm_pos_length = start - prev_image_end
743
+ llm_position_ids = text_positions[:, current_pos : current_pos + llm_pos_length].to(device=device)
744
+ current_pos += llm_position_ids.shape[-1]
745
+
746
+ # Image tokens with 2D spatial encoding
747
+ # For an image with height H and width W:
748
+ # - position_width cycles [0, 1, ..., W-1] for each row, repeated H times
749
+ # - position_height stays constant per row, [0]*W, [1]*W, ..., [H-1]*W
750
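+ # Worked example (hypothetical values): H=2, W=3, current_pos=5 gives
+ #   position_width    = [5, 6, 7, 5, 6, 7]
+ #   position_height   = [5, 5, 5, 6, 6, 6]
+ #   position_temporal = [5, 5, 5, 5, 5, 5]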
+ image_seq_length = height * width
751
+ position_width = torch.arange(current_pos, current_pos + width, device=device).repeat(height)
752
+ position_height = torch.arange(current_pos, current_pos + height, device=device).repeat_interleave(
753
+ width
754
+ )
755
+ position_temporal = torch.full((image_seq_length,), current_pos, device=device, dtype=torch.long)
756
+ vision_position_ids = torch.stack([position_temporal, position_height, position_width], dim=0)
757
+ current_pos += max(height, width)
758
+
759
+ prev_image_end = end
760
+ curr_position_ids.append(torch.cat([llm_position_ids, vision_position_ids], dim=-1))
761
+
762
+ # Remaining text tokens (including the final image_start token for generation)
763
+ end_position = len(curr_input_ids_valid) - prev_image_end
764
+ llm_position_ids = text_positions[:, current_pos : current_pos + end_position].to(device=device)
765
+ current_pos += llm_position_ids.shape[-1]
766
+ curr_position_ids.append(llm_position_ids)
767
+
768
+ # Concatenate all position ids for this sample
769
+ curr_position_ids = torch.cat(curr_position_ids, dim=-1)
770
+
771
+ # Store in the main position_ids tensor
772
+ if valid_mask is not None:
773
+ position_ids[:, batch_idx, valid_mask] = curr_position_ids
774
+ else:
775
+ position_ids[:, batch_idx, :] = curr_position_ids
776
+
777
+ # Build decode position ids for this sample
778
+ if curr_grids is not None and len(curr_grids) > 0:
779
+ num_decode_grids = len(curr_grids) - num_complete_images
780
+ num_decode_grids = max(num_decode_grids, 0)
781
+ decode_pos = current_pos
782
+
783
+ decode_temporal_list = []
784
+ decode_height_list = []
785
+ decode_width_list = []
786
+
787
+ for i in range(1, num_decode_grids + 1):
788
+ grid_idx = -i
789
+ h = curr_grids[grid_idx, 1].item()
790
+ w = curr_grids[grid_idx, 2].item()
791
+ total_tokens = h * w
792
+
793
+ h_indices = torch.arange(h, device=device).unsqueeze(1).expand(h, w).flatten()
794
+ w_indices = torch.arange(w, device=device).unsqueeze(0).expand(h, w).flatten()
795
+
796
+ decode_temporal_list.append(
797
+ torch.full((total_tokens,), decode_pos, device=device, dtype=torch.long)
798
+ )
799
+ decode_height_list.append(decode_pos + h_indices)
800
+ decode_width_list.append(decode_pos + w_indices)
801
+ decode_pos = decode_pos + max(h, w)
802
+
803
+ # End marker
804
+ decode_temporal_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
805
+ decode_height_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
806
+ decode_width_list.append(torch.tensor([decode_pos], device=device, dtype=torch.long))
807
+
808
+ sample_decode_pos_ids = torch.stack(
809
+ [
810
+ torch.cat(decode_temporal_list, dim=0),
811
+ torch.cat(decode_height_list, dim=0),
812
+ torch.cat(decode_width_list, dim=0),
813
+ ],
814
+ dim=0,
815
+ )
816
+ all_decode_position_ids.append(sample_decode_pos_ids)
817
+
818
+ # Store prefill length (same for all samples since input_ids is padded to same length)
819
+ self._prefill_len = seq_len
820
+
821
+ # Pad decode position ids to same length and stack
822
+ if all_decode_position_ids:
823
+ max_decode_len = max(x.shape[1] for x in all_decode_position_ids)
824
+ padded_decode_pos_ids = [
825
+ F.pad(pos_ids, (0, max_decode_len - pos_ids.shape[1]), mode="replicate")
826
+ for pos_ids in all_decode_position_ids
827
+ ]
828
+ self._cached_decode_position_ids = torch.stack(padded_decode_pos_ids, dim=0) # [batch, 3, max_decode_len]
829
+ else:
830
+ self._cached_decode_position_ids = None
831
+
832
+ mrope_position_deltas = torch.zeros([batch_size, 1], dtype=dtype, device=device)
833
+
834
+ return position_ids, mrope_position_deltas
835
+
836
+ def get_image_tokens(
837
+ self,
838
+ hidden_states: torch.FloatTensor,
839
+ image_grid_thw: torch.LongTensor,
840
+ ) -> torch.LongTensor:
841
+ """
842
+ Tokenizes image features into discrete tokens with VQVAE module.
843
+
844
+ Args:
845
+ hidden_states (`torch.FloatTensor` of shape `(total_patches, hidden_size)`):
846
+ The packed image features from vision encoder.
847
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`):
848
+ The temporal, height and width of feature shape of each image.
849
+
850
+ Returns:
851
+ image_tokens (`torch.LongTensor` of shape `(total_patches,)`):
852
+ Discrete token indices from the VQVAE codebook.
853
+ """
854
+ hidden_size = hidden_states.shape[-1]
855
+ split_sizes = (image_grid_thw.prod(dim=-1)).tolist()
856
+ hidden_states_list = torch.split(hidden_states, split_sizes, dim=0)
857
+
858
+ all_image_toks = []
859
+ for i, hs in enumerate(hidden_states_list):
860
+ grid_t, grid_h, grid_w = image_grid_thw[i].tolist()
861
+ hs = hs.view(grid_t, grid_h, grid_w, hidden_size)
862
+ hs = hs.permute(0, 3, 1, 2).contiguous()
863
+ vqmodel_outputs: GlmImageVQVAEModelOutput = self.vqmodel.encode(hs)
864
+ all_image_toks.append(vqmodel_outputs.image_tokens)
865
+ return torch.cat(all_image_toks, dim=0)
866
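+
+ # Shape sketch (illustrative): for grids [1, 4, 4] and [1, 2, 2] the input packs
+ # 16 + 4 = 20 patch features, and the output is 20 discrete codebook indices.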
+
867
+ def get_video_features(self):
868
+ raise AttributeError("Not needed for GlmImage")
869
+
870
+ @can_return_tuple
871
+ @auto_docstring
872
+ def get_image_features(
873
+ self,
874
+ pixel_values: torch.FloatTensor,
875
+ image_grid_thw: torch.LongTensor | None = None,
876
+ **kwargs: Unpack[TransformersKwargs],
877
+ ) -> tuple | BaseModelOutputWithPooling:
878
+ r"""
879
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
880
+ The tensors corresponding to the input images.
881
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
882
+ The temporal, height and width of feature shape of each image in LLM.
883
+ """
884
+ pixel_values = pixel_values.type(self.visual.dtype)
885
+ vision_outputs = self.visual(pixel_values, grid_thw=image_grid_thw, return_dict=True, **kwargs)
886
+ split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
887
+ image_embeds = torch.split(vision_outputs.last_hidden_state, split_sizes)
888
+ vision_outputs.pooler_output = image_embeds
889
+
890
+ return vision_outputs
891
+
892
+ def get_placeholder_mask(
893
+ self,
894
+ input_ids: torch.LongTensor,
895
+ image_ids: torch.LongTensor,
896
+ ):
897
+ """
898
+ Compute the mask of image placeholder tokens in input_ids whose positions will be filled with image token ids from the VQVAE.
899
+
900
+ Args:
901
+ input_ids (`torch.LongTensor` of shape `(batch_size, seq_len)`):
902
+ Input token ids with image placeholders.
903
+ image_ids (`torch.LongTensor` of shape `(num_images, num_tokens_per_image)` or flattened):
904
+ Discrete token indices from the VQVAE codebook.
905
+
906
+ Returns:
907
+ special_image_mask (`torch.LongTensor` of shape `(batch_size, seq_len)`):
908
+ Mask indicating positions in input ids that will be replaced by actual image tokens.
909
+ """
910
+
911
+ special_image_mask = input_ids == self.config.image_token_id
912
+ n_placeholder_tokens = special_image_mask.sum().item()
913
+ n_image_tokens = image_ids.shape[0]
914
+
915
+ if n_placeholder_tokens != n_image_tokens:
916
+ raise ValueError(
917
+ f"Number of image placeholder tokens ({n_placeholder_tokens}) does not match "
918
+ f"number of image tokens from VQVAE ({n_image_tokens})"
919
+ )
920
+
921
+ return special_image_mask
922
+
923
+ def forward(
924
+ self,
925
+ input_ids: torch.LongTensor | None = None,
926
+ attention_mask: torch.Tensor | None = None,
927
+ position_ids: torch.LongTensor | None = None,
928
+ past_key_values: Cache | None = None,
929
+ inputs_embeds: torch.FloatTensor | None = None,
930
+ pixel_values: torch.Tensor | None = None,
931
+ image_grid_thw: torch.LongTensor | None = None,
932
+ images_per_sample: torch.LongTensor | None = None,
933
+ rope_deltas: torch.LongTensor | None = None,
934
+ cache_position: torch.LongTensor | None = None,
935
+ **kwargs: Unpack[TransformersKwargs],
936
+ ) -> tuple | GlmImageModelOutputWithPast:
937
+ r"""
938
+ image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
939
+ The temporal, height and width of feature shape of each image in LLM.
940
+ Images are packed across all samples in the batch.
941
+ images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
942
+ Number of images (including target grids) for each sample in the batch.
943
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
944
+ The rope index difference between sequence length and multimodal rope.
945
+ """
946
+ if (input_ids is None) ^ (inputs_embeds is not None):
947
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
948
+
949
+ batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
950
+
951
+ if pixel_values is not None:
952
+ # Process source images (image-to-image mode)
953
+ # Source images are identified by counting image_end_token_id in input_ids
954
+ # Note: We must exclude padding tokens since pad_token_id == image_end_token_id
955
+ if images_per_sample is not None:
956
+ grids_per_sample = torch.split(image_grid_thw, images_per_sample.tolist())
957
+ # Create mask for non-padding tokens (attention_mask=1 means non-padding)
958
+ # Handle 4D attention mask (from static cache) by extracting diagonal
959
+ if attention_mask is not None and attention_mask.ndim == 4:
960
+ non_pad_mask = torch.diagonal(attention_mask[:, 0], dim1=1, dim2=2)
961
+ if non_pad_mask.dtype.is_floating_point:
962
+ non_pad_mask = non_pad_mask / torch.finfo(non_pad_mask.dtype).min
963
+ non_pad_mask = (1.0 - non_pad_mask).int()
964
+ # Only keep columns matching input_ids length
965
+ non_pad_mask = non_pad_mask[:, -input_ids.shape[1] :]
966
+ else:
967
+ non_pad_mask = attention_mask if attention_mask is not None else torch.ones_like(input_ids)
968
+
969
+ source_grids_list = []
970
+ for sample_idx in range(batch_size):
971
+ is_image_end = input_ids[sample_idx] == self.config.image_end_token_id
972
+ is_non_pad = non_pad_mask[sample_idx] == 1
973
+ num_source = (is_image_end & is_non_pad).sum().item()
974
+ if num_source > 0:
975
+ source_grids_list.append(grids_per_sample[sample_idx][:num_source])
976
+ if len(source_grids_list) == 0:
977
+ raise ValueError(
978
+ "pixel_values provided but no source images found in input_ids. "
979
+ "Ensure input_ids contains image_end_token_id for each source image."
980
+ )
981
+ source_grids = torch.cat(source_grids_list, dim=0)
982
+ else:
983
+ # Fallback for batch_size=1: all but last grid are source images
984
+ source_grids = image_grid_thw[:-1]
985
+
986
+ image_features = self.get_image_features(pixel_values, source_grids, return_dict=True)
987
+ image_embeds = torch.cat(image_features.pooler_output, dim=0)
988
+ image_ids = self.get_image_tokens(image_embeds, source_grids)
989
+ image_ids = image_ids.view(-1).to(input_ids.device)
990
+ special_image_mask = self.get_placeholder_mask(input_ids, image_ids)
991
+ input_ids = input_ids.masked_scatter(special_image_mask, image_ids)
992
+
993
+ if inputs_embeds is None:
994
+ inputs_embeds = self.get_input_embeddings()(input_ids)
995
+
996
+ if position_ids is None:
997
+ attention_mask_2d = attention_mask
998
+ if attention_mask is not None and attention_mask.ndim == 4:
999
+ attention_mask_2d = torch.diagonal(attention_mask[:, 0], dim1=1, dim2=2)
1000
+ # Only apply conversion for floating point tensors (inverted masks)
1001
+ if attention_mask_2d.dtype.is_floating_point:
1002
+ attention_mask_2d = attention_mask_2d / torch.finfo(attention_mask_2d.dtype).min
1003
+ attention_mask_2d = (1.0 - attention_mask_2d).int()
1004
+
1005
+ # Calculate RoPE index once per generation in the pre-fill stage only.
1006
+ is_prefill_stage = (input_ids is not None and input_ids.shape[1] != 1) or (
1007
+ inputs_embeds is not None and inputs_embeds.shape[1] != 1
1008
+ )
1009
+ if is_prefill_stage or self.rope_deltas is None:
1010
+ position_ids, rope_deltas = self.get_rope_index(
1011
+ input_ids,
1012
+ image_grid_thw,
1013
+ images_per_sample=images_per_sample,
1014
+ attention_mask=attention_mask_2d,
1015
+ )
1016
+ self.rope_deltas = rope_deltas
1017
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
1018
+ else:
1019
+ batch_size, seq_length, _ = inputs_embeds.shape
1020
+ # Per-sample decode position lookup
1021
+ # _cached_decode_position_ids shape: [batch_size, 3, max_decode_len]
1022
+ if self._cached_decode_position_ids is not None:
1023
+ step = cache_position[0].item() - self._prefill_len
1024
+ # Get position ids for all samples at once, then transpose to [3, batch_size, seq_length]
1025
+ position_ids = self._cached_decode_position_ids[:, :, step : step + seq_length].permute(1, 0, 2)
1026
+ else:
1027
+ # Fallback for text-to-image or cases without cached decode positions
1028
+ # Use simple incremental positions
1029
+ start_pos = cache_position[0].item()
1030
+ position_ids = torch.arange(
1031
+ start_pos, start_pos + seq_length, device=inputs_embeds.device, dtype=torch.long
1032
+ )
1033
+ position_ids = position_ids.unsqueeze(0).repeat(3, batch_size, 1)
1034
+
1035
+ outputs = self.language_model(
1036
+ input_ids=None,
1037
+ position_ids=position_ids,
1038
+ attention_mask=attention_mask,
1039
+ past_key_values=past_key_values,
1040
+ inputs_embeds=inputs_embeds,
1041
+ cache_position=cache_position,
1042
+ **kwargs,
1043
+ )
1044
+
1045
+ return GlmImageModelOutputWithPast(
1046
+ last_hidden_state=outputs.last_hidden_state,
1047
+ past_key_values=outputs.past_key_values,
1048
+ hidden_states=outputs.hidden_states,
1049
+ attentions=outputs.attentions,
1050
+ rope_deltas=self.rope_deltas,
1051
+ )
1052
+
1053
+
1054
+ class GlmImageCausalLMOutputWithPast(Glm4vCausalLMOutputWithPast):
1055
+ pass
1056
+
1057
+
1058
+ class GlmImageForConditionalGeneration(GlmImagePreTrainedModel, GenerationMixin):
1059
+ _checkpoint_conversion_mapping = {}
1060
+ _tied_weights_keys = {}
1061
+ # Reference: fix gemma3 grad acc #37208
1062
+ accepts_loss_kwargs = False
1063
+ base_model_prefix = "model"
1064
+ config: GlmImageConfig
1065
+
1066
+ def __init__(self, config):
1067
+ super().__init__(config)
1068
+ self.model = GlmImageModel(config)
1069
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vision_vocab_size, bias=False)
1070
+
1071
+ # Initialize weights and apply final processing
1072
+ self.post_init()
1073
+
1074
+ @auto_docstring
1075
+ def get_image_features(
1076
+ self,
1077
+ pixel_values: torch.FloatTensor,
1078
+ image_grid_thw: torch.LongTensor | None = None,
1079
+ **kwargs: Unpack[TransformersKwargs],
1080
+ ) -> tuple | BaseModelOutputWithPooling:
1081
+ r"""
1082
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1083
+ The tensors corresponding to the input images.
1084
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1085
+ The temporal, height and width of feature shape of each image in LLM.
1086
+ """
1087
+ return self.model.get_image_features(pixel_values, image_grid_thw, **kwargs)
1088
+
1089
+ def get_image_tokens(self, hidden_states: torch.FloatTensor, image_grid_thw: torch.LongTensor | None = None):
1090
+ return self.model.get_image_tokens(hidden_states, image_grid_thw)
1091
+
1092
+ def forward(
1093
+ self,
1094
+ input_ids: torch.LongTensor | None = None,
1095
+ attention_mask: torch.Tensor | None = None,
1096
+ position_ids: torch.LongTensor | None = None,
1097
+ past_key_values: Cache | None = None,
1098
+ inputs_embeds: torch.FloatTensor | None = None,
1099
+ labels: torch.LongTensor | None = None,
1100
+ pixel_values: torch.Tensor | None = None,
1101
+ image_grid_thw: torch.LongTensor | None = None,
1102
+ images_per_sample: torch.LongTensor | None = None,
1103
+ cache_position: torch.LongTensor | None = None,
1104
+ logits_to_keep: int | torch.Tensor = 0,
1105
+ **kwargs: Unpack[TransformersKwargs],
1106
+ ) -> tuple | GlmImageCausalLMOutputWithPast:
1107
+ r"""
1108
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1109
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1110
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1111
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1112
+ image_grid_thw (`torch.LongTensor` of shape `(total_images_in_batch, 3)`, *optional*):
1113
+ The temporal, height and width of feature shape of each image in LLM.
1114
+ Images are packed across all samples in the batch.
1115
+ images_per_sample (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1116
+ Number of images (including target grids) for each sample in the batch.
1117
+
1118
+ Example:
1119
+
1120
+ ```python
1121
+ >>> from PIL import Image
1122
+ >>> import httpx
1123
+ >>> from io import BytesIO
1124
+ >>> from transformers import AutoProcessor, GlmImageForConditionalGeneration
1125
+
1126
+ >>> model = GlmImageForConditionalGeneration.from_pretrained("zai-org/GLM-Image")
1127
+ >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-Image")
1128
+
1129
+ >>> messages = [
1130
+ {
1131
+ "role": "user",
1132
+ "content": [
1133
+ {"type": "image"},
1134
+ {"type": "text", "text": "Add a truck of this photo.<sop>28 40<eop>"},
1135
+ ],
1136
+ },
1137
+ ]
1138
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
1139
+ >>> with httpx.stream("GET", url) as response:
1140
+ ... image = Image.open(BytesIO(response.read()))
1141
+
1142
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
1143
+ >>> inputs = processor(text=[text], images=[image])
1144
+
1145
+ >>> # Generate
1146
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1147
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1148
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
1149
+ ```"""
1150
+ outputs = self.model(
1151
+ input_ids=input_ids,
1152
+ pixel_values=pixel_values,
1153
+ image_grid_thw=image_grid_thw,
1154
+ images_per_sample=images_per_sample,
1155
+ position_ids=position_ids,
1156
+ attention_mask=attention_mask,
1157
+ past_key_values=past_key_values,
1158
+ inputs_embeds=inputs_embeds,
1159
+ cache_position=cache_position,
1160
+ **kwargs,
1161
+ )
1162
+
1163
+ hidden_states = outputs[0]
1164
+
1165
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
1166
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
1167
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
1168
+
1169
+ loss = None
1170
+ if labels is not None:
1171
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
1172
+
1173
+ return GlmImageCausalLMOutputWithPast(
1174
+ loss=loss,
1175
+ logits=logits,
1176
+ past_key_values=outputs.past_key_values,
1177
+ hidden_states=outputs.hidden_states,
1178
+ attentions=outputs.attentions,
1179
+ rope_deltas=outputs.rope_deltas,
1180
+ )
1181
+
1182
+ def prepare_inputs_for_generation(
1183
+ self,
1184
+ input_ids,
1185
+ past_key_values=None,
1186
+ attention_mask=None,
1187
+ inputs_embeds=None,
1188
+ cache_position=None,
1189
+ position_ids=None,
1190
+ use_cache=True,
1191
+ pixel_values=None,
1192
+ image_grid_thw=None,
1193
+ images_per_sample=None,
1194
+ is_first_iteration=False,
1195
+ **kwargs,
1196
+ ):
1197
+ model_inputs = super().prepare_inputs_for_generation(
1198
+ input_ids,
1199
+ past_key_values=past_key_values,
1200
+ attention_mask=attention_mask,
1201
+ inputs_embeds=inputs_embeds,
1202
+ cache_position=cache_position,
1203
+ position_ids=position_ids,
1204
+ pixel_values=pixel_values,
1205
+ image_grid_thw=image_grid_thw,
1206
+ is_first_iteration=is_first_iteration,
1207
+ use_cache=use_cache,
1208
+ **kwargs,
1209
+ )
1210
+
1211
+ model_inputs["position_ids"] = None
1212
+ model_inputs["images_per_sample"] = images_per_sample
1213
+
1214
+ if not is_first_iteration and use_cache:
1215
+ model_inputs["pixel_values"] = None
1216
+
1217
+ return model_inputs
1218
+
1219
+ def _get_image_nums(
1220
+ self,
1221
+ input_ids: torch.LongTensor | None,
1222
+ ) -> torch.Tensor:
1223
+ """
1224
+ Get the number of images for each sample.
1225
+ For GLM-Image, only input_ids allow us to get the number of images.
1226
+
1227
+ Returns:
1228
+ image_counts (`torch.LongTensor` of shape `(batch_size,)`)
1229
+ """
1230
+ is_image = input_ids == self.config.image_start_token_id
1231
+
1232
+ return is_image.sum(dim=1)
1233
+
1234
+ def _expand_inputs_for_generation(
1235
+ self,
1236
+ expand_size: int = 1,
1237
+ is_encoder_decoder: bool = False,
1238
+ input_ids: torch.LongTensor | None = None,
1239
+ **model_kwargs,
1240
+ ) -> tuple[torch.LongTensor, dict[str, Any]]:
1241
+ # Overwritten -- Support for expanding tensors without a batch size dimension
1242
+ # e.g., pixel_values, image_grid_thw
1243
+ # pixel_values.shape[0] is sum(seqlen_images for samples)
1244
+ # image_grid_thw.shape[0] is sum(num_images for samples)
1245
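+ # e.g. (illustrative) expand_size=2 with images_per_sample=[1, 2] turns grid rows
+ # [g0 | g1, g2] into [g0, g0 | g1, g2, g1, g2] via _repeat_interleave_samples below.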
+
1246
+ if expand_size == 1:
1247
+ return input_ids, model_kwargs
1248
+
1249
+ visual_keys = ["pixel_values", "image_grid_thw", "images_per_sample"]
1250
+
1251
+ def _expand_dict_for_generation_visual(dict_to_expand):
1252
+ image_grid_thw = model_kwargs.get("image_grid_thw", None)
1253
+ if image_grid_thw is None:
1254
+ return dict_to_expand
1255
+
1256
+ images_per_sample = model_kwargs.get("images_per_sample", None)
1257
+
1258
+ # Use images_per_sample if available
1259
+ if images_per_sample is not None:
1260
+ image_nums = images_per_sample.tolist()
1261
+ elif input_ids is not None:
1262
+ # Try to infer from image_grid_thw / batch_size
1263
+ batch_size = input_ids.shape[0]
1264
+ total_grids = image_grid_thw.shape[0]
1265
+ if total_grids % batch_size == 0:
1266
+ grids_per_sample = total_grids // batch_size
1267
+ image_nums = [grids_per_sample] * batch_size
1268
+ else:
1269
+ # Cannot evenly distribute grids - fall back to simple repeat_interleave
1270
+ # This handles test cases where image_grid_thw has (batch_size + 1) rows
1271
+ dict_to_expand["image_grid_thw"] = image_grid_thw.repeat_interleave(expand_size, dim=0)
1272
+ if dict_to_expand.get("pixel_values") is not None:
1273
+ dict_to_expand["pixel_values"] = dict_to_expand["pixel_values"].repeat_interleave(
1274
+ expand_size, dim=0
1275
+ )
1276
+ return dict_to_expand
1277
+ else:
1278
+ image_nums = self._get_image_nums(input_ids).tolist()
1279
+
1280
+ # Get source image counts per sample from image_end_token_id count
1281
+ source_image_nums = [
1282
+ (input_ids[batch_idx] == self.config.image_end_token_id).sum().item()
1283
+ for batch_idx in range(len(image_nums))
1284
+ ]
1285
+
1286
+ def _repeat_interleave_samples(x, lengths, repeat_times):
1287
+ samples = torch.split(x, lengths)
1288
+ repeat_args = [repeat_times] + [1] * (x.dim() - 1)
1289
+ result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
1290
+ return result
1291
+
1292
+ for key in dict_to_expand:
1293
+ if key == "pixel_values":
1294
+ # Split images into samples based on source image counts
1295
+ if sum(source_image_nums) > 0:
1296
+ # Split grids by sample to compute pixel counts
1297
+ grids_per_sample = torch.split(image_grid_thw, image_nums)
1298
+ lengths = []
1299
+ for batch_idx, sample_grids in enumerate(grids_per_sample):
1300
+ num_source = source_image_nums[batch_idx]
1301
+ if num_source > 0:
1302
+ source_grids = sample_grids[:num_source]
1303
+ lengths.append(torch.prod(source_grids, dim=1).sum().item())
1304
+ else:
1305
+ lengths.append(0)
1306
+
1307
+ dict_to_expand[key] = _repeat_interleave_samples(
1308
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1309
+ )
1310
+ elif key == "image_grid_thw":
1311
+ # Expand all grids (source + target) per sample
1312
+ dict_to_expand[key] = _repeat_interleave_samples(
1313
+ dict_to_expand[key], lengths=image_nums, repeat_times=expand_size
1314
+ )
1315
+ elif key == "images_per_sample":
1316
+ # Simply repeat the counts
1317
+ if dict_to_expand.get(key) is not None:
1318
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
1319
+ return dict_to_expand
1320
+
1321
+ def _expand_dict_for_generation(dict_to_expand):
1322
+ for key in dict_to_expand:
1323
+ if (
1324
+ key != "cache_position"
1325
+ and dict_to_expand[key] is not None
1326
+ and isinstance(dict_to_expand[key], torch.Tensor)
1327
+ and key not in visual_keys
1328
+ ):
1329
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
1330
+ return dict_to_expand
1331
+
1332
+ model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
1333
+
1334
+ if input_ids is not None:
1335
+ input_ids = input_ids.repeat_interleave(expand_size, dim=0)
1336
+
1337
+ model_kwargs = _expand_dict_for_generation(model_kwargs)
1338
+
1339
+ if is_encoder_decoder:
1340
+ if model_kwargs.get("encoder_outputs") is None:
1341
+ raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
1342
+ model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
1343
+
1344
+ return input_ids, model_kwargs
1345
+
1346
+
1347
+ def smart_resize(
1348
+ height: int,
1349
+ width: int,
1350
+ factor: int = 16,
1351
+ min_pixels: int = 512 * 512,
1352
+ max_pixels: int = 2048 * 2048,
1353
+ ) -> tuple[int, int]:
1354
+ if height < factor or width < factor:
1355
+ raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
1356
+ elif max(height, width) / min(height, width) > 4:
1357
+ raise ValueError(
1358
+ f"absolute aspect ratio must be smaller than 4, got {max(height, width) / min(height, width)}"
1359
+ )
1360
+
1361
+ shortest_edge = int(round(math.sqrt(min_pixels)))
1362
+ longest_edge = int(round(math.sqrt(max_pixels)))
1363
+ min_side = min(height, width)
1364
+ max_side = max(height, width)
1365
+
1366
+ scale = 1.0
1367
+
1368
+ if min_side < shortest_edge:
1369
+ scale = shortest_edge / min_side
1370
+
1371
+ if max_side * scale > longest_edge:
1372
+ scale = longest_edge / max_side
1373
+
1374
+ height = height // 2
1375
+ width = width // 2
1376
+
1377
+ h_bar = max(factor, int(round(height * scale / factor)) * factor)
1378
+ w_bar = max(factor, int(round(width * scale / factor)) * factor)
1379
+
1380
+ if max(h_bar, w_bar) > longest_edge:
1381
+ beta = max(h_bar, w_bar) / longest_edge
1382
+ h_bar = max(factor, int(math.floor((h_bar / beta) / factor)) * factor)
1383
+ w_bar = max(factor, int(math.floor((w_bar / beta) / factor)) * factor)
1384
+
1385
+ return h_bar, w_bar
1386
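+
+ # Sanity check (illustrative values, not from the library's tests): with the
+ # defaults above, a 1024x768 image is halved to 512x384 and snapped to multiples
+ # of `factor`, so smart_resize(1024, 768) == (512, 384).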
+
1387
+
1388
+ class GlmImageImageProcessor(Qwen2VLImageProcessor):
1389
+ model_input_names = ["pixel_values", "image_grid_thw", "images_per_sample"]
1390
+
1391
+
1392
+ class GlmImageImageProcessorFast(Qwen2VLImageProcessorFast):
1393
+ model_input_names = ["pixel_values", "image_grid_thw", "images_per_sample"]
1394
+
1395
+
1396
+ class GlmImageImagesKwargs(ImagesKwargs, total=False):
1397
+ """
1398
+ target_h (`int`):
1399
+ Height of the target image to be generated.
1400
+ target_w (`int`):
1401
+ Width of the target image to be generated.
1402
+ """
1403
+
1404
+ target_h: int
1405
+ target_w: int
1406
+
1407
+
1408
+ class GlmImageProcessorKwargs(Qwen2VLProcessorKwargs):
1409
+ images_kwargs: GlmImageImagesKwargs
1410
+
1411
+ _defaults = {
1412
+ "text_kwargs": {
1413
+ "padding": False,
1414
+ "return_mm_token_type_ids": False,
1415
+ },
1416
+ "images_kwargs": {
1417
+ "target_h": 1152,
1418
+ "target_w": 768,
1419
+ },
1420
+ }
1421
+
1422
+
1423
+ class GlmImageProcessor(ProcessorMixin):
1424
+ r"""
1425
+ Constructs a GLM-Image processor which wraps a GLM-Image image processor and a GLM-Image tokenizer into a single processor.
1426
+ See [`~GlmImageProcessor.__call__`] and [`~GlmImageProcessor.decode`] for more information.
1427
+ Args:
1428
+ image_processor ([`GlmImageImageProcessor`], *optional*):
1429
+ The image processor is a required input.
1430
+ tokenizer ([`PreTrainedTokenizerFast`], *optional*):
1431
+ The tokenizer is a required input.
1432
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
1433
+ in a chat into a tokenizable string.
1434
+ """
1435
+
1436
+ model_input_names = ["input_ids", "attention_mask", "pixel_values", "image_grid_thw", "images_per_sample"]
1437
+
1438
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
1439
+ self.image_token = tokenizer.image_token
1440
+ self.grid_bos_token = tokenizer.grid_bos_token
1441
+ self.grid_eos_token = tokenizer.grid_eos_token
1442
+ self.bos_token = tokenizer.bos_token
1443
+ self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
1444
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
1445
+
1446
+ def __call__(
1447
+ self,
1448
+ images: ImageInput | None = None,
1449
+ text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
1450
+ **kwargs: Unpack[GlmImageProcessorKwargs],
1451
+ ) -> BatchFeature:
1452
+ """
1453
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
1454
+ and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
1455
+ the text.
1456
+
1457
+ Args:
1458
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
1459
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
1460
+ tensor. Both channels-first and channels-last formats are supported.
1461
+ text (`str`, `List[str]`, `List[List[str]]`):
1462
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
1463
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
1464
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
1465
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
1466
+ If set, will return tensors of a particular framework. Acceptable values are:
1467
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
1468
+ - `'np'`: Return NumPy `np.ndarray` objects.
1469
+
1470
+ Returns:
1471
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
1472
+
1473
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
1474
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
1475
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
1476
+ `None`).
1477
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
1478
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
1479
+ """
1480
+ output_kwargs = self._merge_kwargs(
1481
+ GlmImageProcessorKwargs,
1482
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
1483
+ **kwargs,
1484
+ )
1485
+
1486
+ target_h = output_kwargs["images_kwargs"].pop("target_h", None)
1487
+ target_w = output_kwargs["images_kwargs"].pop("target_w", None)
1488
+ is_text_to_image = images is None
1489
+
1490
+ if images is not None:
1491
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
1492
+ image_grid_thw = image_inputs["image_grid_thw"]
1493
+ else:
1494
+ image_inputs = {}
1495
+ image_grid_thw = None
1496
+
1497
+ # Handle text=None case (image-only processing)
1498
+ if text is None:
1499
+ if images is None:
1500
+ raise ValueError("You must provide at least one of `text` or `images`.")
1501
+ return image_inputs
1502
+
1503
+ if not isinstance(text, list):
1504
+ text = [text]
1505
+
1506
+ batch_size = len(text)
1507
+ text = text.copy() # below lines change text in-place
1508
+
1509
+ # Count images per sample by counting image tokens in each text
1510
+ images_per_sample = []
1511
+ for i in range(batch_size):
1512
+ images_per_sample.append(text[i].count(self.image_token))
1513
+
1514
+ # Replace image tokens with the correct number of placeholder tokens
1515
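+        # Each image token is first expanded to `h * w` copies of a temporary "<|placeholder|>" marker rather than
+        # to the image token directly; otherwise the `while self.image_token in text[i]` loop would keep re-matching
+        # the freshly inserted tokens. The markers are swapped back to real image tokens once the loop finishes.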
+        if not is_text_to_image:
+            index = 0
+            for i in range(batch_size):
+                while self.image_token in text[i]:
+                    grid = image_grid_thw[index]
+                    num_image_tokens = int(grid[1] * grid[2])
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+        # Build prompt with target shape and combine grids in a single loop
+        # Format: [sample0_source_grids..., sample0_target_grids, sample1_source_grids..., sample1_target_grids, ...]
+        # Note: In i2i mode, batches are homogeneous (same number of source images per sample)
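+        # e.g. for a batch of two i2i samples with one source image each:
+        #   all_grids = [src_0, tgt_0, src_1, tgt_1], each entry an `(n, 3)` tensor of (t, h, w) rows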
+        num_source_images = images_per_sample[0] if images_per_sample else 0
+
+        # Validate homogeneity for i2i mode
+        if not is_text_to_image and images_per_sample and len(set(images_per_sample)) != 1:
+            raise ValueError(
+                f"In image-to-image mode, all samples must have the same number of source images. "
+                f"Got different counts: {images_per_sample}"
+            )
+
+        all_grids = []
+        for i in range(batch_size):
+            text[i], token_h, token_w, prev_h, prev_w = self._build_prompt_with_target_shape(
+                text[i], height=target_h, width=target_w, is_text_to_image=is_text_to_image
+            )
+            # Add source grids for this sample (i2i mode only)
+            if not is_text_to_image and num_source_images > 0:
+                start_idx = i * num_source_images
+                all_grids.append(image_grid_thw[start_idx : start_idx + num_source_images])
+            # Add target grid for this sample
+            all_grids.append(
+                self._build_target_image_grid_thw(
+                    token_h=token_h,
+                    token_w=token_w,
+                    prev_token_h=prev_h,
+                    prev_token_w=prev_w,
+                    is_text_to_image=is_text_to_image,
+                )
+            )
+        image_inputs["image_grid_thw"] = torch.cat(all_grids, dim=0)
+
+        # Store images_per_sample for later use (add target images count)
+        # Each sample will have: source_images + target_images (typically 2 for t2i, 1 for i2i)
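+        # e.g. t2i: 0 source images + 2 target grids = 2 per sample; i2i with one source image: 1 + 1 = 2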
+        num_target_grids = 2 if is_text_to_image else 1
+        image_inputs["images_per_sample"] = torch.tensor(
+            [num_source_images + num_target_grids] * batch_size, dtype=torch.long
+        )
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _build_prompt_with_target_shape(
+        self,
+        prompt: str,
+        height: int,
+        width: int,
+        is_text_to_image: bool,
+    ) -> tuple[str, int, int, int, int]:
+        factor = 32
+        height = (height // factor) * factor
+        width = (width // factor) * factor
+        token_h = height // factor
+        token_w = width // factor
+        ratio = token_h / token_w
+        prev_token_h = int(math.sqrt(ratio) * (factor // 2))
+        prev_token_w = int(math.sqrt(1 / ratio) * (factor // 2))
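+        # Worked example: height=1024, width=768 -> token_h=32, token_w=24, ratio=4/3.
+        # The preview grid keeps the aspect ratio at roughly (factor // 2) ** 2 = 256 tokens:
+        # prev_token_h = int(sqrt(4/3) * 16) = 18, prev_token_w = int(sqrt(3/4) * 16) = 13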
+
+        if is_text_to_image:
+            expanded_prompt = f"{prompt}{self.grid_bos_token}{token_h} {token_w}{self.grid_eos_token}{self.grid_bos_token}{prev_token_h} {prev_token_w}{self.grid_eos_token}{self.bos_token}"
+        else:
+            expanded_prompt = f"{prompt}{self.grid_bos_token}{token_h} {token_w}{self.grid_eos_token}{self.bos_token}"
+
+        return expanded_prompt, token_h, token_w, prev_token_h, prev_token_w
+
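+    # Target grids pair with the prompt built above: t2i emits both the full grid and the preview grid,
+    # e.g. (token_h=32, token_w=24, preview 18x13) -> [[1, 32, 24], [1, 18, 13]]; i2i emits only [[1, 32, 24]].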
+    @staticmethod
+    def _build_target_image_grid_thw(
+        token_h: int,
+        token_w: int,
+        prev_token_h: int,
+        prev_token_w: int,
+        is_text_to_image: bool = True,
+    ):
+        if is_text_to_image:
+            # Text-to-image: 2 target grids (large + small preview)
+            return torch.tensor(
+                [
+                    [1, token_h, token_w],
+                    [1, prev_token_h, prev_token_w],
+                ],
+            )
+        else:
+            # Image-to-image: 1 target grid only
+            return torch.tensor(
+                [
+                    [1, token_h, token_w],
+                ],
+            )
+
+
+__all__ = [
+    "GlmImageVQVAEConfig",
+    "GlmImageVisionConfig",
+    "GlmImageTextConfig",
+    "GlmImageConfig",
+    "GlmImagePreTrainedModel",
+    "GlmImageVQVAE",
+    "GlmImageVisionModel",
+    "GlmImageTextModel",
+    "GlmImageModel",
+    "GlmImageForConditionalGeneration",
+    "GlmImageImageProcessor",
+    "GlmImageImageProcessorFast",
+    "GlmImageProcessor",
+]