transformers-5.0.0rc0-py3-none-any.whl → transformers-5.0.0rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/paddleocr_vl/modular_paddleocr_vl.py
@@ -0,0 +1,1349 @@
1
+ # Copyright 2025 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
4
+ # and OPT implementations in this library. It has been modified from its
5
+ # original forms to accommodate minor architectural differences compared
6
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ import math
21
+ from typing import Optional, Union
22
+
23
+ import numpy as np
24
+ import torch
25
+ import torch.nn.functional as F
26
+ from torch import nn
27
+
28
+ from ...activations import GELUActivation
29
+ from ...cache_utils import Cache, DynamicCache
30
+ from ...image_processing_utils import BatchFeature
31
+ from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
32
+ from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
33
+ from ...image_utils import (
34
+ OPENAI_CLIP_MEAN,
35
+ OPENAI_CLIP_STD,
36
+ ChannelDimension,
37
+ ImageInput,
38
+ PILImageResampling,
39
+ SizeDict,
40
+ get_image_size,
41
+ infer_channel_dimension_format,
42
+ is_scaled_image,
43
+ make_list_of_images,
44
+ to_numpy_array,
45
+ )
46
+ from ...masking_utils import create_bidirectional_mask, create_causal_mask
47
+ from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
48
+ from ...modeling_utils import PreTrainedModel
49
+ from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
50
+ from ...processing_utils import (
51
+ ProcessingKwargs,
52
+ ProcessorMixin,
53
+ Unpack,
54
+ )
55
+ from ...tokenization_utils_base import PreTokenizedInput, TextInput
56
+ from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
57
+ from ...utils.generic import check_model_inputs
58
+ from ..ernie4_5.configuration_ernie4_5 import Ernie4_5Config
59
+ from ..ernie4_5.modeling_ernie4_5 import (
60
+ Ernie4_5DecoderLayer,
61
+ Ernie4_5MLP,
62
+ Ernie4_5Model,
63
+ Ernie4_5RMSNorm,
64
+ )
65
+ from ..qwen2_5_omni.modeling_qwen2_5_omni import (
66
+ Qwen2_5OmniAttention,
67
+ )
68
+ from ..qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig
69
+ from ..qwen2_vl.modeling_qwen2_vl import (
70
+ Qwen2VLCausalLMOutputWithPast,
71
+ Qwen2VLForConditionalGeneration,
72
+ Qwen2VLModel,
73
+ Qwen2VLModelOutputWithPast,
74
+ Qwen2VLRotaryEmbedding,
75
+ VisionRotaryEmbedding,
76
+ )
77
+ from ..siglip.configuration_siglip import SiglipVisionConfig
78
+ from ..siglip.modeling_siglip import (
79
+ SiglipMLP,
80
+ SiglipVisionEmbeddings,
81
+ )
82
+ from ..video_llama_3.modeling_video_llama_3 import (
83
+ VideoLlama3VisionAttention,
84
+ VideoLlama3VisionEncoder,
85
+ VideoLlama3VisionEncoderLayer,
86
+ )
87
+
88
+
89
+ logger = logging.get_logger(__name__)
90
+
91
+
92
+ def smart_resize(
93
+ height: int,
94
+ width: int,
95
+ factor: int = 28,
96
+ min_pixels: int = 384 * 384,
97
+ max_pixels: int = 1536 * 1536,
98
+ ):
99
+ if height < factor:
100
+ width = round((width * factor) / height)
101
+ height = factor
102
+
103
+ if width < factor:
104
+ height = round((height * factor) / width)
105
+ width = factor
106
+
107
+ if max(height, width) / min(height, width) > 200:
108
+ raise ValueError(
109
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
110
+ )
111
+ h_bar = round(height / factor) * factor
112
+ w_bar = round(width / factor) * factor
113
+ if h_bar * w_bar > max_pixels:
114
+ beta = math.sqrt((height * width) / max_pixels)
115
+ h_bar = math.floor(height / beta / factor) * factor
116
+ w_bar = math.floor(width / beta / factor) * factor
117
+ elif h_bar * w_bar < min_pixels:
118
+ beta = math.sqrt(min_pixels / (height * width))
119
+ h_bar = math.ceil(height * beta / factor) * factor
120
+ w_bar = math.ceil(width * beta / factor) * factor
121
+ return h_bar, w_bar
122
+
123
+
124
+ class PaddleOCRVLImageProcessor(Qwen2VLImageProcessor):
125
+ r"""
126
+ Constructs a PaddleOCRVL image processor that dynamically resizes images based on the original images.
127
+
128
+ Args:
129
+ do_resize (`bool`, *optional*, defaults to `True`):
130
+ Whether to resize the image's (height, width) dimensions.
131
+ size (`dict[str, int]`, *optional*):
132
+ Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
133
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
134
+ Resampling filter to use when resizing the image.
135
+ do_rescale (`bool`, *optional*, defaults to `True`):
136
+ Whether to rescale the image by the specified scale `rescale_factor`.
137
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
138
+ Scale factor to use if rescaling the image.
139
+ do_normalize (`bool`, *optional*, defaults to `True`):
140
+ Whether to normalize the image.
141
+ image_mean (`float` or `list[float]`, *optional*):
142
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
143
+ image_std (`float` or `list[float]`, *optional*):
144
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
145
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
146
+ Whether to convert the image to RGB.
147
+ min_pixels (`int`, *optional*, defaults to `384 * 384`):
148
+ The min pixels of the image to resize the image.
149
+ max_pixels (`int`, *optional*, defaults to `1536 * 1536`):
150
+ The max pixels of the image to resize the image.
151
+ patch_size (`int`, *optional*, defaults to 14):
152
+ The spatial patch size of the vision encoder.
153
+ temporal_patch_size (`int`, *optional*, defaults to 1):
154
+ The temporal patch size of the vision encoder.
155
+ merge_size (`int`, *optional*, defaults to 2):
156
+ The merge size of the vision encoder to llm encoder.
157
+ """
158
+
159
+ model_input_names = [
160
+ "pixel_values",
161
+ "image_grid_thw",
162
+ ]
163
+
164
+ def __init__(
165
+ self,
166
+ do_resize: bool = True,
167
+ size: Optional[dict[str, int]] = None,
168
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
169
+ do_rescale: bool = True,
170
+ rescale_factor: Union[int, float] = 1 / 255,
171
+ do_normalize: bool = True,
172
+ image_mean: Optional[Union[float, list[float]]] = None,
173
+ image_std: Optional[Union[float, list[float]]] = None,
174
+ do_convert_rgb: bool = True,
175
+ min_pixels: int = 384 * 384,
176
+ max_pixels: int = 1536 * 1536,
177
+ patch_size: int = 14,
178
+ temporal_patch_size: int = 1,
179
+ merge_size: int = 2,
180
+ **kwargs,
181
+ ) -> None:
182
+ super().__init__()
183
+
184
+ def _preprocess(
185
+ self,
186
+ images: ImageInput,
187
+ do_resize: Optional[bool] = None,
188
+ size: Optional[dict[str, int]] = None,
189
+ resample: PILImageResampling = None,
190
+ do_rescale: Optional[bool] = None,
191
+ rescale_factor: Optional[float] = None,
192
+ do_normalize: Optional[bool] = None,
193
+ image_mean: Optional[Union[float, list[float]]] = None,
194
+ image_std: Optional[Union[float, list[float]]] = None,
195
+ patch_size: Optional[int] = None,
196
+ temporal_patch_size: Optional[int] = None,
197
+ merge_size: Optional[int] = None,
198
+ do_convert_rgb: Optional[bool] = None,
199
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
200
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
201
+ ):
202
+ """
203
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
204
+ Args:
205
+ images (`ImageInput`):
206
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
207
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
208
+ Whether to resize the image.
209
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
210
+ Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
211
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
212
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
213
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
214
+ Whether to rescale the image.
215
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
216
+ Scale factor to use if rescaling the image.
217
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
218
+ Whether to normalize the image.
219
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
220
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
221
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
222
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
223
+ patch_size (`int`, *optional*, defaults to `self.patch_size`):
224
+ The spatial patch size of the vision encoder.
225
+ temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
226
+ The temporal patch size of the vision encoder.
227
+ merge_size (`int`, *optional*, defaults to `self.merge_size`):
228
+ The merge size of the vision encoder to llm encoder.
229
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
230
+ Whether to convert the image to RGB.
231
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
232
+ The channel dimension format for the output image. Can be one of:
233
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
234
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
235
+ - Unset: Use the channel dimension format of the input image.
236
+ input_data_format (`ChannelDimension` or `str`, *optional*):
237
+ The channel dimension format for the input image. Can be one of:
238
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
239
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
240
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
241
+ """
242
+ images = make_list_of_images(images)
243
+ images = self.fetch_images(images)
244
+
245
+ if do_convert_rgb:
246
+ images = [convert_to_rgb(image) for image in images]
247
+
248
+ # All transformations expect numpy arrays.
249
+ images = [to_numpy_array(image) for image in images]
250
+
251
+ if is_scaled_image(images[0]) and do_rescale:
252
+ logger.warning_once(
253
+ "It looks like you are trying to rescale already rescaled images. If the input"
254
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
255
+ )
256
+ if input_data_format is None:
257
+ # We assume that all images have the same channel dimension format.
258
+ input_data_format = infer_channel_dimension_format(images[0])
259
+
260
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
261
+ resized_height, resized_width = height, width
262
+ processed_images = []
263
+
264
+ for image in images:
265
+ if do_resize:
266
+ resized_height, resized_width = smart_resize(
267
+ height,
268
+ width,
269
+ factor=patch_size * merge_size,
270
+ min_pixels=size["shortest_edge"],
271
+ max_pixels=size["longest_edge"],
272
+ )
273
+ image = resize(
274
+ image,
275
+ size=(resized_height, resized_width),
276
+ resample=resample,
277
+ input_data_format=input_data_format,
278
+ )
279
+
280
+ if do_rescale:
281
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
282
+
283
+ if do_normalize:
284
+ image = self.normalize(
285
+ image=image,
286
+ mean=image_mean,
287
+ std=image_std,
288
+ input_data_format=input_data_format,
289
+ )
290
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
291
+ processed_images.append(image)
292
+
293
+ patches = np.array(processed_images)
294
+ if data_format == ChannelDimension.LAST:
295
+ patches = patches.transpose(0, 3, 1, 2)
296
+ if patches.shape[0] == 1:
297
+ patches = np.tile(patches, (temporal_patch_size, 1, 1, 1))
298
+
299
+ channel = patches.shape[1]
300
+ grid_t = patches.shape[0] // temporal_patch_size
301
+ grid_h, grid_w = (
302
+ resized_height // patch_size,
303
+ resized_width // patch_size,
304
+ )
305
+ patches = patches.reshape(
306
+ grid_t,
307
+ temporal_patch_size,
308
+ channel,
309
+ grid_h,
310
+ patch_size,
311
+ grid_w,
312
+ patch_size,
313
+ )
314
+ patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
315
+ if temporal_patch_size != 1:
316
+ raise ValueError(f"temporal_patch_size must be 1!, but got {temporal_patch_size}!")
317
+ flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, patch_size, patch_size)
318
+ return flatten_patches, (grid_t, grid_h, grid_w)
319
+
320
+
321
+ class PaddleOCRVLImageProcessorFast(BaseImageProcessorFast):
322
+ def __init__(
323
+ self,
324
+ do_resize: bool = True,
325
+ size: Optional[dict[str, int]] = None,
326
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
327
+ do_rescale: bool = True,
328
+ rescale_factor: Union[int, float] = 1 / 255,
329
+ do_normalize: bool = True,
330
+ image_mean: Optional[Union[float, list[float]]] = None,
331
+ image_std: Optional[Union[float, list[float]]] = None,
332
+ do_convert_rgb: bool = True,
333
+ min_pixels: int = 384 * 384,
334
+ max_pixels: int = 1536 * 1536,
335
+ patch_size: int = 14,
336
+ temporal_patch_size: int = 1,
337
+ merge_size: int = 2,
338
+ **kwargs,
339
+ ) -> None:
340
+ super().__init__(**kwargs)
341
+ if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
342
+ raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
343
+ else:
344
+ size = {"shortest_edge": 384 * 384, "longest_edge": 1536 * 1536}
345
+ # backward compatibility: override size with min_pixels and max_pixels if they are provided
346
+ if min_pixels is not None:
347
+ size["shortest_edge"] = min_pixels
348
+ if max_pixels is not None:
349
+ size["longest_edge"] = max_pixels
350
+ self.min_pixels = size["shortest_edge"]
351
+ self.max_pixels = size["longest_edge"]
352
+ self.size = size
353
+
354
+ self.do_resize = do_resize
355
+ self.resample = resample
356
+ self.do_rescale = do_rescale
357
+ self.rescale_factor = rescale_factor
358
+ self.do_normalize = do_normalize
359
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
360
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
361
+
362
+ self.patch_size = patch_size
363
+ self.temporal_patch_size = temporal_patch_size
364
+ self.merge_size = merge_size
365
+ self.do_convert_rgb = do_convert_rgb
366
+
367
+ def _preprocess(
368
+ self,
369
+ images: list["torch.Tensor"],
370
+ do_resize: bool,
371
+ size: SizeDict,
372
+ interpolation: Optional["F.InterpolationMode"],
373
+ do_rescale: bool,
374
+ rescale_factor: float,
375
+ do_normalize: bool,
376
+ image_mean: Optional[Union[float, list[float]]],
377
+ image_std: Optional[Union[float, list[float]]],
378
+ disable_grouping: Optional[bool],
379
+ return_tensors: Optional[Union[str, TensorType]],
380
+ patch_size: Optional[int] = None,
381
+ temporal_patch_size: Optional[int] = None,
382
+ merge_size: Optional[int] = None,
383
+ **kwargs,
384
+ ):
385
+ patch_size = patch_size if patch_size is not None else self.patch_size
386
+ temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
387
+ merge_size = merge_size if merge_size is not None else self.merge_size
388
+
389
+ grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
390
+ resized_images_grouped = {}
391
+ for shape, stacked_images in grouped_images.items():
392
+ height, width = stacked_images.shape[-2:]
393
+ if do_resize:
394
+ resized_height, resized_width = smart_resize(
395
+ height,
396
+ width,
397
+ factor=patch_size * merge_size,
398
+ min_pixels=size["shortest_edge"],
399
+ max_pixels=size["longest_edge"],
400
+ )
401
+ stacked_images = self.resize(
402
+ image=stacked_images,
403
+ size=SizeDict(height=resized_height, width=resized_width),
404
+ interpolation=interpolation,
405
+ )
406
+ resized_images_grouped[shape] = stacked_images
407
+ resized_images = reorder_images(resized_images_grouped, grouped_images_index)
408
+
409
+ # Group images by size for further processing
410
+ # Needed in case do_resize is False, or resize returns images with different sizes
411
+ grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
412
+ processed_images_grouped = {}
413
+ processed_grids = {}
414
+ for shape, stacked_images in grouped_images.items():
415
+ resized_height, resized_width = stacked_images.shape[-2:]
416
+ # Fused rescale and normalize
417
+ patches = self.rescale_and_normalize(
418
+ stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
419
+ )
420
+
421
+ if patches.ndim == 4:
422
+ # add a temporal dimension if we have images
423
+ patches = patches.unsqueeze(1)
424
+ if patches.shape[1] % temporal_patch_size != 0:
425
+ repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
426
+ patches = torch.cat([patches, repeats], dim=1)
427
+
428
+ batch_size, grid_t, channel = patches.shape[:3]
429
+ grid_t = grid_t // temporal_patch_size
430
+ grid_h, grid_w = (
431
+ resized_height // patch_size,
432
+ resized_width // patch_size,
433
+ )
434
+ patches = patches.view(
435
+ batch_size,
436
+ grid_t,
437
+ temporal_patch_size,
438
+ channel,
439
+ grid_h,
440
+ patch_size,
441
+ grid_w,
442
+ patch_size,
443
+ )
444
+ patches = patches.permute(0, 1, 4, 6, 3, 2, 5, 7)
445
+ flatten_patches = patches.reshape(batch_size, grid_t * grid_h * grid_w, channel, patch_size, patch_size)
446
+
447
+ processed_images_grouped[shape] = flatten_patches
448
+ processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
449
+
450
+ processed_images = reorder_images(processed_images_grouped, grouped_images_index)
451
+ processed_grids = reorder_images(processed_grids, grouped_images_index)
452
+ pixel_values = torch.cat(processed_images, dim=0)
453
+ image_grid_thw = torch.tensor(processed_grids)
454
+
455
+ return BatchFeature(
456
+ data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
457
+ )
458
+
459
+
460
+ class PaddleOCRVLProcessorKwargs(ProcessingKwargs, total=False):
461
+ _defaults = {
462
+ "text_kwargs": {
463
+ "padding": False,
464
+ },
465
+ }
466
+
467
+
468
+ class PaddleOCRVLProcessor(ProcessorMixin):
469
+ r"""
470
+ [`PaddleOCRVLProcessor`] offers all the functionalities of [`PaddleOCRVLImageProcessor`] and [`LLamaTokenizerFast`]. See the
471
+ [`~PaddleOCRVLProcessor.__call__`] and [`~PaddleOCRVLProcessor.decode`] for more information.
472
+ Args:
473
+ image_processor ([`PaddleOCRVLImageProcessor`], *optional*):
474
+ The image processor is a required input.
475
+ tokenizer ([`LLamaTokenizerFast`], *optional*):
476
+ The tokenizer is a required input.
477
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
478
+ in a chat into a tokenizable string.
479
+ """
480
+
481
+ image_processor_class = "AutoImageProcessor"
482
+ tokenizer_class = "AutoTokenizer"
483
+
484
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
485
+ self.image_token = tokenizer.image_token
486
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
487
+
488
+ def __call__(
489
+ self,
490
+ images: ImageInput = None,
491
+ text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
492
+ **kwargs: Unpack[PaddleOCRVLProcessorKwargs],
493
+ ) -> BatchFeature:
494
+ """
495
+ Args:
496
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
497
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
498
+ tensor. Both channels-first and channels-last formats are supported.
499
+ text (`str`, `List[str]`, `List[List[str]]`):
500
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
501
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
502
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
503
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
504
+ If set, will return tensors of a particular framework. Acceptable values are:
505
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
506
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
507
+ - `'np'`: Return NumPy `np.ndarray` objects.
508
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
509
+
510
+ Returns:
511
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
512
+
513
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
514
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
515
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
516
+ `None`).
517
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
518
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
519
+ """
520
+ output_kwargs = self._merge_kwargs(
521
+ PaddleOCRVLProcessorKwargs,
522
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
523
+ **kwargs,
524
+ )
525
+
526
+ if images is not None:
527
+ image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
528
+ image_grid_thw = image_inputs["image_grid_thw"]
529
+
530
+ else:
531
+ image_inputs = {}
532
+ image_grid_thw = None
533
+
534
+ if not isinstance(text, list):
535
+ text = [text]
536
+
537
+ text = text.copy()
538
+
539
+ if image_grid_thw is not None:
540
+ index = 0
541
+ for i in range(len(text)):
542
+ while self.image_token in text[i]:
543
+ text[i] = text[i].replace(
544
+ self.image_token,
545
+ "<|placeholder|>"
546
+ * (
547
+ image_grid_thw[index].prod()
548
+ // self.image_processor.merge_size
549
+ // self.image_processor.merge_size
550
+ ),
551
+ 1,
552
+ )
553
+ index += 1
554
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
555
+
556
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
557
+
558
+ return BatchFeature(data={**text_inputs, **image_inputs})
559
+
560
+
561
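
Editor's note: the expansion loop in `PaddleOCRVLProcessor.__call__` replaces each image token with `prod(grid_thw) // merge_size**2` placeholders before tokenization. A minimal standalone sketch of just that arithmetic; the token string, grid values and `merge_size` below are illustrative assumptions, not the processor's actual defaults:

```python
# Standalone sketch of the placeholder expansion performed by the processor.
image_token = "<|IMAGE|>"  # hypothetical image token string
merge_size = 2             # assumed spatial merge size

# one (t, h, w) patch grid per image, as the image processor would produce
image_grid_thw = [(1, 16, 24), (1, 8, 8)]

text = [f"OCR this: {image_token}", f"And this: {image_token}"]

expanded = []
index = 0
for prompt in text:
    while image_token in prompt:
        t, h, w = image_grid_thw[index]
        # merge_size x merge_size patches collapse into one LLM token
        num_tokens = (t * h * w) // (merge_size * merge_size)
        prompt = prompt.replace(image_token, "<|placeholder|>" * num_tokens, 1)
        index += 1
    expanded.append(prompt.replace("<|placeholder|>", image_token))

print([p.count(image_token) for p in expanded])  # [96, 16]
```
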
+ class PaddleOCRVisionConfig(SiglipVisionConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`PaddleOCRVisionModel`]. It is used to instantiate a
+     PaddleOCRVL vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the vision encoder of the PaddleOCRVL
+     [PaddlePaddle/PaddleOCR-VL](https://huggingface.co/PaddlePaddle/PaddleOCR-VL) architecture.
+
+     Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PreTrainedConfig`] for more information.
+
+     Args:
+         hidden_size (`int`, *optional*, defaults to 1152):
+             Dimensionality of the encoder layers and the pooler layer.
+         intermediate_size (`int`, *optional*, defaults to 4304):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         num_hidden_layers (`int`, *optional*, defaults to 27):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 16):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_channels (`int`, *optional*, defaults to 3):
+             Number of channels in the input images.
+         image_size (`int`, *optional*, defaults to 384):
+             The size (resolution) of each image.
+         patch_size (`int`, *optional*, defaults to 14):
+             The size (resolution) of each patch.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the layer normalization layers.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         spatial_merge_size (`int`, *optional*, defaults to 2):
+             The size used for merging spatial dimensions.
+
+     Example:
+
+     ```python
+     >>> from transformers import PaddleOCRVisionConfig, PaddleOCRVisionModel
+
+     >>> # Initializing a PaddleOCRVisionConfig with PaddlePaddle/PaddleOCR-VL style configuration
+     >>> configuration = PaddleOCRVisionConfig()
+
+     >>> # Initializing a PaddleOCRVisionModel (with random weights) from the PaddlePaddle/PaddleOCR-VL style configuration
+     >>> model = PaddleOCRVisionModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```
+     """
+
+     model_type = "paddleocr_vl_vision"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         hidden_size=1152,
+         intermediate_size=4304,
+         num_hidden_layers=27,
+         num_attention_heads=16,
+         num_channels=3,
+         image_size=384,
+         patch_size=14,
+         hidden_act="gelu_pytorch_tanh",
+         layer_norm_eps=1e-6,
+         attention_dropout=0.0,
+         spatial_merge_size=2,
+         **kwargs,
+     ):
+         super().__init__()
+         self.spatial_merge_size = spatial_merge_size
+
+
+ class PaddleOCRTextConfig(Ernie4_5Config):
+     model_type = "paddleocr_vl_text"
+
+
+ class PaddleOCRVLConfig(Qwen2VLConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`PaddleOCRVLForConditionalGeneration`]. It is used to instantiate a
+     PaddleOCRVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of
+     PaddleOCRVL [PaddlePaddle/PaddleOCR-VL](https://huggingface.co/PaddlePaddle/PaddleOCR-VL).
+
+     Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PreTrainedConfig`] for more information.
+
+     Args:
+         text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `PaddleOCRTextConfig`):
+             The config object or dictionary of the text backbone.
+         vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `PaddleOCRVisionConfig`):
+             The config object or dictionary of the vision backbone.
+         image_token_id (`int`, *optional*, defaults to 100295):
+             The image token index to encode the image prompt.
+         video_token_id (`int`, *optional*, defaults to 100296):
+             The video token index to encode the video prompt.
+         vision_start_token_id (`int`, *optional*, defaults to 101305):
+             The token index to denote start of vision input.
+         vision_end_token_id (`int`, *optional*, defaults to 101306):
+             The token index to denote end of vision input.
+
+     ```python
+     >>> from transformers import PaddleOCRVLForConditionalGeneration, PaddleOCRVLConfig
+
+     >>> # Initializing a PaddleOCRVL style configuration
+     >>> configuration = PaddleOCRVLConfig()
+
+     >>> # Initializing a model from the PaddleOCRVL style configuration
+     >>> model = PaddleOCRVLForConditionalGeneration(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     sub_configs = {"vision_config": PaddleOCRVisionConfig, "text_config": PaddleOCRTextConfig}
+
+     def __init__(
+         self,
+         text_config=None,
+         vision_config=None,
+         image_token_id=100295,
+         video_token_id=100296,
+         vision_start_token_id=101305,
+         vision_end_token_id=101306,
+         **kwargs,
+     ):
+         super().__init__()
+
+
+ class PaddleOCRProjector(nn.Module):
+     def __init__(self, config: PaddleOCRVLConfig):
+         super().__init__()
+         self.merge_kernel_size = (config.vision_config.spatial_merge_size, config.vision_config.spatial_merge_size)
+
+         hidden_size = config.vision_config.hidden_size * self.merge_kernel_size[0] * self.merge_kernel_size[1]
+
+         self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-05)
+         self.linear_1 = nn.Linear(hidden_size, hidden_size, bias=True)
+         self.act = GELUActivation()
+         self.linear_2 = nn.Linear(hidden_size, config.text_config.hidden_size, bias=True)
+
+     def forward(self, image_features: torch.Tensor, image_grid_thw: torch.Tensor) -> torch.Tensor:
+         image_features_chunks = image_features.split(image_grid_thw.prod(dim=1).tolist(), dim=0)
+         m1, m2 = self.merge_kernel_size
+
+         processed_features = []
+         for image_feature, image_grid in zip(image_features_chunks, image_grid_thw):
+             image_feature = self.pre_norm(image_feature)
+             t, h, w = image_grid
+             d = image_feature.shape[-1]
+             h_block = h // m1
+             w_block = w // m2
+
+             image_feature = image_feature.reshape(t, h_block, m1, w_block, m2, d)
+             image_feature = image_feature.transpose(2, 3)
+             image_feature = image_feature.reshape(t * h_block * w_block, m1 * m2 * d)
+
+             hidden_states = self.linear_1(image_feature)
+             hidden_states = self.act(hidden_states)
+             hidden_states = self.linear_2(hidden_states)
+             processed_features.append(hidden_states)
+
+         return torch.cat(processed_features, dim=0)
+
+
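
Editor's note: `PaddleOCRProjector.forward` folds each `merge_size x merge_size` block of vision patches into a single, wider token before the two linear layers. A shape-only sketch of that reshape with dummy tensors; all dimensions below are illustrative, not the model's real sizes:

```python
import torch

# Illustrative sizes only: one image, grid (t, h, w) = (1, 4, 6), vision hidden size 8, 2x2 merge kernel.
t, h, w, d = 1, 4, 6, 8
m1 = m2 = 2

image_feature = torch.randn(t * h * w, d)            # (24, 8) patch features for one image

h_block, w_block = h // m1, w // m2                   # 2 x 3 merged blocks
x = image_feature.reshape(t, h_block, m1, w_block, m2, d)
x = x.transpose(2, 3)                                 # group the m1 x m2 patches of each block together
x = x.reshape(t * h_block * w_block, m1 * m2 * d)     # (6, 32): 6 merged tokens, 4x wider features

print(x.shape)  # torch.Size([6, 32])
```
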
+ class PaddleOCRVisionRotaryEmbedding(VisionRotaryEmbedding):
+     pass
+
+
+ class PaddleOCRRotaryEmbedding(Qwen2VLRotaryEmbedding):
+     pass
+
+
+ class PaddleOCRMLP(Ernie4_5MLP):
+     def __init__(self, config: PaddleOCRTextConfig):
+         super().__init__()
+
+
+ class PaddleOCRAttention(Qwen2_5OmniAttention):
+     def __init__(self, config: PaddleOCRVLConfig, layer_idx: Optional[int] = None):
+         super().__init__()
+
+         self.attention_dropout = 0.0
+         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_bias)
+         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_bias)
+         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_bias)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.use_bias)
+
+
+ class PaddleOCRRMSNorm(Ernie4_5RMSNorm):
+     pass
+
+
+ class PaddleOCRDecoderLayer(Ernie4_5DecoderLayer):
+     def __init__(self, config: PaddleOCRTextConfig, layer_idx: int):
+         super().__init__()
+
+
+ @auto_docstring
+ class PaddleOCRVLPreTrainedModel(PreTrainedModel):
+     config: PaddleOCRVLConfig
+     base_model_prefix = "model"
+     supports_gradient_checkpointing = True
+     _no_split_modules = ["PaddleOCRDecoderLayer"]
+     _skip_keys_device_placement = ["past_key_values"]
+     _supports_flash_attn = True
+     _supports_sdpa = True
+     _supports_flex_attn = True
+
+     _can_compile_fullgraph = True
+     _supports_attention_backend = True
+
+     _can_record_outputs = {
+         "hidden_states": PaddleOCRDecoderLayer,
+         "attentions": PaddleOCRAttention,
+     }
+
+
+ class PaddleOCRTextModel(PaddleOCRVLPreTrainedModel, Ernie4_5Model):
+     def __init__(self, config: PaddleOCRTextConfig):
+         super().__init__(config)
+
+     @check_model_inputs
+     @auto_docstring
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> BaseModelOutputWithPast:
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if inputs_embeds is None:
+             inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+         if use_cache and past_key_values is None:
+             past_key_values = DynamicCache(config=self.config)
+
+         if cache_position is None:
+             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+             cache_position: torch.Tensor = (
+                 torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
+             )
+
+         if position_ids is None:
+             position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
+         elif position_ids.ndim == 2:
+             position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+
+         if position_ids.ndim == 3 and position_ids.shape[0] == 4:
+             text_position_ids = position_ids[0]
+             position_ids = position_ids[1:]
+         else:
+             text_position_ids = None
+
+         causal_mask = create_causal_mask(
+             config=self.config,
+             input_embeds=inputs_embeds,
+             attention_mask=attention_mask,
+             cache_position=cache_position,
+             past_key_values=past_key_values,
+             position_ids=text_position_ids,
+         )
+
+         hidden_states = inputs_embeds
+         position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids)
+
+         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+             hidden_states = decoder_layer(
+                 hidden_states,
+                 attention_mask=causal_mask,
+                 position_embeddings=position_embeddings,
+                 position_ids=text_position_ids,
+                 past_key_values=past_key_values,
+                 use_cache=use_cache,
+                 cache_position=cache_position,
+                 **kwargs,
+             )
+
+         hidden_states = self.norm(hidden_states)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=past_key_values,
+         )
+
+
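
Editor's note: when no explicit `position_ids` are passed, the text model broadcasts the 1D `cache_position` to three identical rope axes (temporal/height/width), giving a `(3, batch_size, seq_len)` tensor. A small sketch of just that expansion; the batch, sequence and cache lengths are illustrative:

```python
import torch

batch_size, seq_len, past_seen_tokens = 2, 5, 3

# cache_position indexes the new tokens after whatever is already in the KV cache
cache_position = torch.arange(seq_len) + past_seen_tokens          # tensor([3, 4, 5, 6, 7])

# one copy of the same positions per rope axis and per batch element
position_ids = cache_position.view(1, 1, -1).expand(3, batch_size, -1)

print(position_ids.shape)   # torch.Size([3, 2, 5])
print(position_ids[0, 0])   # tensor([3, 4, 5, 6, 7])
```
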
+ class PaddleOCRVisionModel(PaddleOCRVLPreTrainedModel):
+     config: PaddleOCRVisionConfig
+     main_input_name = "pixel_values"
+     input_modalities = "image"
+
+     def __init__(self, config: PaddleOCRVisionConfig):
+         super().__init__(config)
+
+         self.vision_model = PaddleOCRVisionTransformer(config)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         cu_seqlens: torch.Tensor,
+         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
+         **kwargs,
+     ) -> BaseModelOutputWithPooling:
+         """
+         Args:
+             pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
+                 The tensors corresponding to the input images.
+             cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
+                 The cumulative sequence lengths of each image or video feature.
+             image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                 The temporal, height and width of feature shape of each image in LLM.
+         """
+         return self.vision_model(
+             pixel_values=pixel_values,
+             cu_seqlens=cu_seqlens,
+             image_grid_thw=image_grid_thw,
+         )
+
+
+ class PaddleOCRVisionEmbeddings(SiglipVisionEmbeddings):
+     def __init__(self, config: PaddleOCRVisionConfig):
+         super().__init__()
+
+     def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+         num_positions = self.position_embedding.weight.shape[0]
+
+         patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
+
+         dim = embeddings.shape[-1]
+
+         sqrt_num_positions = torch_int(num_positions**0.5)
+         patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+         patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+         patch_pos_embed = nn.functional.interpolate(
+             patch_pos_embed,
+             size=(height, width),
+             mode="bilinear",
+             align_corners=False,
+         )
+
+         patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+         return patch_pos_embed
+
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
+     ) -> torch.Tensor:
+         """
+         Args:
+             pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
+                 The tensors corresponding to the input images.
+             image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                 The temporal, height and width of feature shape of each image in LLM.
+         """
+         batch_size, sequence_len, channel, height, width = pixel_values.shape
+         target_dtype = self.patch_embedding.weight.dtype
+         pixel_values = pixel_values.reshape(batch_size * sequence_len, channel, height, width)
+         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+         embeddings = patch_embeds.flatten(-2).squeeze(-1)
+         embeddings = embeddings.reshape(batch_size, sequence_len, -1)
+
+         start = 0
+         embeddings = embeddings.squeeze(0)
+         tmp_embeddings = []
+         for image_grid in image_grid_thw:
+             t, h, w = image_grid
+             end = start + t * h * w
+             image_embeddings = embeddings[start:end, :]
+             position_embedding = self.interpolate_pos_encoding(image_embeddings, h, w).squeeze(0).repeat(t, 1)
+             image_embeddings = image_embeddings + position_embedding
+             tmp_embeddings.append(image_embeddings)
+             start = end
+         embeddings = torch.concat(tmp_embeddings, dim=0)
+
+         return embeddings
+
+
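
Editor's note: `interpolate_pos_encoding` resizes the learned square position grid to each image's `(h, w)` patch grid with bilinear interpolation. A self-contained sketch with a dummy embedding table; the grid side, hidden size and target shape are assumptions, not the checkpoint's values:

```python
import torch
from torch import nn

# Assume a learned table for a 27x27 patch grid (729 positions) with hidden size 16 -- illustrative only.
num_positions, dim = 27 * 27, 16
position_embedding = nn.Embedding(num_positions, dim)

def interpolate_pos_encoding(height: int, width: int) -> torch.Tensor:
    side = int(num_positions**0.5)                                    # 27
    # square grid -> channels-first image -> resize -> flatten back to a sequence
    grid = position_embedding.weight.reshape(1, side, side, dim).permute(0, 3, 1, 2)
    grid = nn.functional.interpolate(grid, size=(height, width), mode="bilinear", align_corners=False)
    return grid.permute(0, 2, 3, 1).view(1, -1, dim)                  # (1, height * width, dim)

print(interpolate_pos_encoding(16, 24).shape)  # torch.Size([1, 384, 16])
```
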
+ class PaddleOCRVisionAttention(VideoLlama3VisionAttention):
+     def __init__(self, config: PaddleOCRVisionConfig):
+         super().__init__()
+
+
+ class PaddleOCRVisionMLP(SiglipMLP):
+     def __init__(self, config: PaddleOCRVisionConfig):
+         super().__init__()
+
+
+ class PaddleOCRVisionEncoderLayer(VideoLlama3VisionEncoderLayer):
+     def __init__(self, config: PaddleOCRVisionConfig):
+         super().__init__()
+
+
+ class PaddleOCRVisionEncoder(VideoLlama3VisionEncoder):
+     def __init__(self, config: PaddleOCRVisionConfig):
+         super().__init__()
+         embed_dim = config.hidden_size
+         num_heads = config.num_attention_heads
+         head_dim = embed_dim // num_heads
+         self.rotary_pos_emb = PaddleOCRVisionRotaryEmbedding(head_dim // 2)
+
+     def forward(
+         self,
+         inputs_embeds: torch.FloatTensor,
+         cu_seqlens: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
+     ) -> BaseModelOutput:
+         """
+         Args:
+             inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, hidden_size)`, *optional*):
+                 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                 This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                 than the model's internal embedding lookup matrix.
+             cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
+                 The cumulative sequence lengths of each image or video feature.
+             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Mask indicating which tokens should be attended to.
+             image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                 The temporal, height and width of feature shape of each image in LLM.
+         """
+         device = inputs_embeds.device
+         hidden_states = inputs_embeds
+         attention_mask = create_bidirectional_mask(
+             config=self.config,
+             input_embeds=inputs_embeds,
+             attention_mask=attention_mask,
+         )
+         split_hids = []
+         split_wids = []
+         for t, h, w in image_grid_thw:
+             image_pids = torch.arange(t * h * w, device=device) % (h * w)
+             sample_hids = image_pids // w
+             sample_wids = image_pids % w
+             split_hids.append(sample_hids)
+             split_wids.append(sample_wids)
+         width_position_ids = torch.concat(split_wids, dim=0)
+         height_position_ids = torch.concat(split_hids, dim=0)
+
+         pids = torch.stack([height_position_ids, width_position_ids], dim=-1)
+         max_grid_size = pids.max() + 1
+         rotary_embeddings_max_grid = self.rotary_pos_emb(max_grid_size)
+         rotary_embeddings = rotary_embeddings_max_grid[pids].flatten(1)
+         rotary_embeddings = rotary_embeddings.repeat(1, 2)
+         position_embeddings = (rotary_embeddings.cos(), rotary_embeddings.sin())
+
+         for encoder_layer in self.layers:
+             hidden_states = encoder_layer(
+                 hidden_states,
+                 cu_seqlens=cu_seqlens,
+                 position_embeddings=position_embeddings,
+             )
+
+         return BaseModelOutput(
+             last_hidden_state=hidden_states,
+         )
+
+
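
Editor's note: the vision encoder derives a (row, column) index for every patch from `image_grid_thw`, then looks up rotary frequencies for both axes. A sketch of the index construction alone; the two grids below are made up:

```python
import torch

image_grid_thw = [(1, 2, 3), (1, 2, 2)]   # illustrative (t, h, w) grids for two images

split_hids, split_wids = [], []
for t, h, w in image_grid_thw:
    image_pids = torch.arange(t * h * w) % (h * w)   # position within one frame
    split_hids.append(image_pids // w)               # row index of each patch
    split_wids.append(image_pids % w)                # column index of each patch

height_position_ids = torch.cat(split_hids)
width_position_ids = torch.cat(split_wids)
pids = torch.stack([height_position_ids, width_position_ids], dim=-1)

print(pids.T)
# rows: 0 0 0 1 1 1 | 0 0 1 1
# cols: 0 1 2 0 1 2 | 0 1 0 1
```
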
+ class PaddleOCRVisionTransformer(PaddleOCRVLPreTrainedModel):
+     def __init__(self, config: PaddleOCRVisionConfig):
+         super().__init__(config)
+         self.config = config
+         embed_dim = config.hidden_size
+
+         self.embeddings = PaddleOCRVisionEmbeddings(config)
+         self.encoder = PaddleOCRVisionEncoder(config)
+         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         cu_seqlens: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
+         **kwargs,
+     ) -> BaseModelOutputWithPooling:
+         """
+         Args:
+             pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size * patch_size * image_channels)`):
+                 The tensors corresponding to the input images.
+             cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
+                 The cumulative sequence lengths of each image or video feature.
+             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Mask indicating which tokens should be attended to.
+             image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                 The temporal, height and width of feature shape of each image in LLM.
+         """
+         hidden_states = self.embeddings(pixel_values, image_grid_thw=image_grid_thw)
+
+         encoder_outputs: BaseModelOutput = self.encoder(
+             inputs_embeds=hidden_states,
+             cu_seqlens=cu_seqlens,
+             attention_mask=attention_mask,
+             image_grid_thw=image_grid_thw,
+         )
+
+         last_hidden_state = encoder_outputs.last_hidden_state
+         last_hidden_state = self.post_layernorm(last_hidden_state)
+
+         return BaseModelOutputWithPooling(
+             last_hidden_state=last_hidden_state,
+             pooler_output=None,
+             hidden_states=encoder_outputs.hidden_states,
+             attentions=encoder_outputs.attentions,
+         )
+
+
+ class PaddleOCRVLModelOutputWithPast(Qwen2VLModelOutputWithPast):
+     pass
+
+
+ class PaddleOCRVLCausalLMOutputWithPast(Qwen2VLCausalLMOutputWithPast):
+     pass
+
+
+ class PaddleOCRVLModel(Qwen2VLModel):
+     _checkpoint_conversion_mapping = {"^model": "language_model"}
+     _keys_to_ignore_on_load_unexpected = ["packing_position_embedding", "vision_model.head"]
+
+     def __init__(self, config: PaddleOCRVLConfig):
+         super().__init__(config)
+         self.visual = PaddleOCRVisionModel._from_config(config.vision_config)
+         self.projector = PaddleOCRProjector(config)
+         self.language_model = PaddleOCRTextModel._from_config(config.text_config)
+         self.rope_deltas = None
+
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.language_model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         self.language_model.embed_tokens = value
+
+     def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
+         """
+         Encodes images into continuous embeddings that can be forwarded to the language model.
+
+         Args:
+             pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                 The tensors corresponding to the input images.
+             image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                 The temporal, height and width of feature shape of each image in LLM.
+         """
+         pixel_values = pixel_values.type(self.visual.dtype).unsqueeze(0)
+         cu_seqlens = torch.repeat_interleave(image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0]).cumsum(
+             dim=0,
+             # Select dtype based on the following factors:
+             # - FA2 requires that cu_seqlens_q must have dtype int32
+             # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
+             # See https://github.com/huggingface/transformers/pull/34852 for more information
+             dtype=image_grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+         )
+         cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)
+         vision_outputs = self.visual(
+             pixel_values=pixel_values,
+             image_grid_thw=image_grid_thw,
+             cu_seqlens=cu_seqlens,
+         )
+         image_embeds = vision_outputs.last_hidden_state
+         image_embeds = self.projector(image_embeds, image_grid_thw)
+         return image_embeds
+
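
Editor's note: `get_image_features` builds `cu_seqlens` as cumulative per-frame patch counts so the vision attention can treat every frame as its own sequence. A sketch of just that bookkeeping; the grid values are illustrative, and int32 mirrors the dtype comment in the code above:

```python
import torch

# Two images: one with 2 temporal frames of 3x4 patches, one single frame of 2x2 patches (illustrative).
image_grid_thw = torch.tensor([[2, 3, 4], [1, 2, 2]])

patches_per_frame = image_grid_thw[:, 1] * image_grid_thw[:, 2]          # tensor([12, 4])
cu_seqlens = torch.repeat_interleave(patches_per_frame, image_grid_thw[:, 0]).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)

print(cu_seqlens)  # tensor([ 0, 12, 24, 28], dtype=torch.int32)
```
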
+     def get_placeholder_mask(
+         self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+     ):
+         """
+         Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+         equal to the length of multimodal features. If the lengths are different, an error is raised.
+         """
+         if input_ids is None:
+             special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                 torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+             )
+             special_image_mask = special_image_mask.all(-1)
+         else:
+             special_image_mask = input_ids == self.config.image_token_id
+
+         n_image_tokens = special_image_mask.sum()
+         special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+         n_image_features = image_features.shape[0] * image_features.shape[1]
+         if inputs_embeds[special_image_mask].numel() != image_features.numel():
+             raise ValueError(
+                 f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+             )
+         return special_image_mask
+
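
Editor's note: the mask returned above marks every image-token slot in `inputs_embeds`; the model then drops the projected image features into those slots with `masked_scatter` (see the `forward` below). A toy version of that substitution with made-up ids and sizes:

```python
import torch

image_token_id, hidden_size = 7, 4                        # illustrative values
input_ids = torch.tensor([[1, 7, 7, 2]])                  # one sequence with two image slots
inputs_embeds = torch.zeros(1, 4, hidden_size)
image_features = torch.ones(2, hidden_size)               # one projected embedding per image slot

special_image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

print(inputs_embeds[0, :, 0])  # tensor([0., 1., 1., 0.]) -- the image rows were filled in
```
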
+     @can_return_tuple
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[list[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         pixel_values: Optional[torch.Tensor] = None,
+         image_grid_thw: Optional[torch.LongTensor] = None,
+         rope_deltas: Optional[torch.LongTensor] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         **kwargs,
+     ) -> Union[tuple, PaddleOCRVLModelOutputWithPast]:
+         r"""
+         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+             The temporal, height and width of feature shape of each image in LLM.
+         rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+             The rope index difference between sequence length and multimodal rope.
+         """
+         if inputs_embeds is None:
+             inputs_embeds = self.language_model.embed_tokens(input_ids)
+
+         if pixel_values is not None:
+             image_embeds = self.get_image_features(pixel_values, image_grid_thw).to(
+                 inputs_embeds.device, inputs_embeds.dtype
+             )
+             image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds)
+             inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
+         if position_ids is None:
+             past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
+             if self.rope_deltas is None or past_key_values_length == 0:
+                 position_ids, rope_deltas = self.get_rope_index(
+                     input_ids=input_ids,
+                     image_grid_thw=image_grid_thw,
+                     attention_mask=attention_mask,
+                 )
+                 self.rope_deltas = rope_deltas
+             # then use the prev pre-calculated rope-deltas to get the correct position ids
+             else:
+                 batch_size, seq_length, _ = inputs_embeds.shape
+                 position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                 position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
+                 delta = (past_key_values_length + self.rope_deltas).to(inputs_embeds.device)
+                 delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                 position_ids = position_ids + delta.to(position_ids.device)
+
+         outputs = self.language_model(
+             input_ids=None,
+             position_ids=position_ids,
+             attention_mask=attention_mask,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             cache_position=cache_position,
+             **kwargs,
+         )
+
+         output = PaddleOCRVLModelOutputWithPast(
+             last_hidden_state=outputs.last_hidden_state,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+             rope_deltas=self.rope_deltas,
+         )
+
+         return output
+
+
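
Editor's note: during decoding the model reuses the `rope_deltas` computed at prefill; new positions are just `arange(seq_len) + past_length + delta`, broadcast to the three rope axes. A sketch of that incremental path with illustrative numbers (the delta value is made up, not one the model would necessarily produce):

```python
import torch

batch_size, seq_len = 1, 1            # one new token per step during decoding
past_key_values_length = 10           # tokens already in the KV cache (illustrative)
rope_deltas = torch.tensor([[-4]])    # delta computed once at prefill (illustrative value)

position_ids = torch.arange(seq_len).view(1, 1, -1).expand(3, batch_size, -1)
delta = past_key_values_length + rope_deltas               # shape (batch_size, 1)
delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
position_ids = position_ids + delta

print(position_ids[:, 0, 0])  # tensor([6, 6, 6]) -- the same shifted position on all three rope axes
```
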
+ class PaddleOCRVLForConditionalGeneration(Qwen2VLForConditionalGeneration):
+     _checkpoint_conversion_mapping = {
+         "^visual": "model.visual",
+         "^mlp_AR": "model.projector",
+         r"^model(?!(\.visual|\.projector|\.language_model))": "model.language_model",
+     }
+     _keys_to_ignore_on_load_unexpected = ["packing_position_embedding", "vision_model.head"]
+
+     @can_return_tuple
+     @auto_docstring
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Cache] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         pixel_values: Optional[torch.Tensor] = None,
+         image_grid_thw: Optional[torch.LongTensor] = None,
+         rope_deltas: Optional[torch.LongTensor] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         logits_to_keep: Union[int, torch.Tensor] = 0,
+         **kwargs: Unpack[TransformersKwargs],
+     ) -> Union[tuple, PaddleOCRVLCausalLMOutputWithPast]:
+         r"""
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+         image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+             The temporal, height and width of feature shape of each image in LLM.
+         rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+             The rope index difference between sequence length and multimodal rope.
+
+         Example:
+
+         ```python
+         >>> from transformers import AutoProcessor, PaddleOCRVLForConditionalGeneration
+
+         >>> model = PaddleOCRVLForConditionalGeneration.from_pretrained("PaddlePaddle/PaddleOCR-VL", dtype="bfloat16")
+         >>> processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL")
+
+         >>> messages = [
+         ...     {
+         ...         "role": "user",
+         ...         "content": [
+         ...             {
+         ...                 "type": "image",
+         ...                 "image": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/ocr_demo.jpg",
+         ...             },
+         ...             {"type": "text", "text": "OCR:"},
+         ...         ],
+         ...     }
+         ... ]
+
+         >>> inputs = processor.apply_chat_template(
+         ...     messages,
+         ...     tokenize=True,
+         ...     add_generation_prompt=True,
+         ...     return_dict=True,
+         ...     return_tensors="pt"
+         ... ).to(model.device)
+
+         >>> # Generate
+         >>> generated_ids = model.generate(**inputs, max_new_tokens=1024)
+         >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+         >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+         >>> print(output_text)
+         ```
+         """
+         outputs: PaddleOCRVLModelOutputWithPast = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             image_grid_thw=image_grid_thw,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             pixel_values=pixel_values,
+             rope_deltas=rope_deltas,
+             cache_position=cache_position,
+             **kwargs,
+         )
+         hidden_states = outputs.last_hidden_state
+
+         slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+         logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+         loss = None
+         if labels is not None:
+             loss = self.loss_function(
+                 logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+             )
+
+         return PaddleOCRVLCausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+             rope_deltas=outputs.rope_deltas,
+         )
+
+
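
Editor's note: `logits_to_keep` trims the LM head to the last N positions, which is all generation needs. A shape-only sketch of the slicing with illustrative dimensions and a stand-in linear head:

```python
import torch

hidden_states = torch.randn(2, 12, 8)       # (batch, seq_len, hidden) -- illustrative sizes
lm_head = torch.nn.Linear(8, 32, bias=False)  # stand-in head with a made-up vocab size

logits_to_keep = 1                           # keep logits for the last position only (0 keeps all)
slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
logits = lm_head(hidden_states[:, slice_indices, :])

print(logits.shape)  # torch.Size([2, 1, 32])
```
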
+ __all__ = [
+     "PaddleOCRVLForConditionalGeneration",
+     "PaddleOCRVLModel",
+     "PaddleOCRVLPreTrainedModel",
+     "PaddleOCRVisionTransformer",
+     "PaddleOCRVLConfig",
+     "PaddleOCRTextModel",
+     "PaddleOCRVisionModel",
+     "PaddleOCRVisionConfig",
+     "PaddleOCRTextConfig",
+     "PaddleOCRVLImageProcessor",
+     "PaddleOCRVLImageProcessorFast",
+     "PaddleOCRVLProcessor",
+ ]