transformers 5.0.0rc0__py3-none-any.whl → 5.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (539)
  1. transformers/__init__.py +30 -3
  2. transformers/cli/serve.py +47 -17
  3. transformers/conversion_mapping.py +15 -2
  4. transformers/convert_slow_tokenizer.py +225 -10
  5. transformers/core_model_loading.py +196 -135
  6. transformers/data/data_collator.py +12 -4
  7. transformers/dependency_versions_table.py +1 -2
  8. transformers/dynamic_module_utils.py +1 -2
  9. transformers/feature_extraction_utils.py +1 -2
  10. transformers/file_utils.py +0 -1
  11. transformers/generation/__init__.py +11 -1
  12. transformers/generation/configuration_utils.py +3 -2
  13. transformers/generation/continuous_batching/__init__.py +4 -0
  14. transformers/generation/continuous_batching/continuous_api.py +134 -79
  15. transformers/image_processing_base.py +1 -2
  16. transformers/integrations/__init__.py +4 -2
  17. transformers/integrations/accelerate.py +15 -3
  18. transformers/integrations/aqlm.py +38 -66
  19. transformers/integrations/awq.py +48 -514
  20. transformers/integrations/bitnet.py +45 -100
  21. transformers/integrations/bitsandbytes.py +79 -191
  22. transformers/integrations/deepspeed.py +1 -0
  23. transformers/integrations/eetq.py +84 -79
  24. transformers/integrations/fbgemm_fp8.py +191 -145
  25. transformers/integrations/finegrained_fp8.py +236 -193
  26. transformers/integrations/fp_quant.py +92 -0
  27. transformers/integrations/ggml.py +11 -1
  28. transformers/integrations/higgs.py +40 -62
  29. transformers/integrations/hub_kernels.py +42 -3
  30. transformers/integrations/integration_utils.py +10 -0
  31. transformers/integrations/mxfp4.py +25 -65
  32. transformers/integrations/peft.py +7 -29
  33. transformers/integrations/quanto.py +73 -55
  34. transformers/integrations/quark.py +55 -0
  35. transformers/integrations/spqr.py +44 -90
  36. transformers/integrations/torchao.py +32 -38
  37. transformers/integrations/vptq.py +42 -59
  38. transformers/modelcard.py +1 -2
  39. transformers/modeling_gguf_pytorch_utils.py +8 -0
  40. transformers/modeling_rope_utils.py +30 -6
  41. transformers/modeling_utils.py +116 -112
  42. transformers/models/__init__.py +3 -0
  43. transformers/models/afmoe/modeling_afmoe.py +4 -4
  44. transformers/models/albert/tokenization_albert.py +6 -12
  45. transformers/models/align/modeling_align.py +2 -0
  46. transformers/models/altclip/modeling_altclip.py +4 -0
  47. transformers/models/apertus/modeling_apertus.py +4 -4
  48. transformers/models/arcee/modeling_arcee.py +4 -4
  49. transformers/models/aria/modeling_aria.py +4 -4
  50. transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
  51. transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
  52. transformers/models/auto/configuration_auto.py +11 -0
  53. transformers/models/auto/feature_extraction_auto.py +2 -0
  54. transformers/models/auto/image_processing_auto.py +1 -0
  55. transformers/models/auto/modeling_auto.py +6 -0
  56. transformers/models/auto/processing_auto.py +18 -10
  57. transformers/models/auto/tokenization_auto.py +74 -472
  58. transformers/models/autoformer/modeling_autoformer.py +4 -0
  59. transformers/models/bamba/modeling_bamba.py +4 -3
  60. transformers/models/bark/modeling_bark.py +2 -0
  61. transformers/models/bart/modeling_bart.py +7 -0
  62. transformers/models/barthez/tokenization_barthez.py +5 -10
  63. transformers/models/beit/modeling_beit.py +6 -1
  64. transformers/models/bert/tokenization_bert.py +8 -21
  65. transformers/models/big_bird/modeling_big_bird.py +6 -0
  66. transformers/models/big_bird/tokenization_big_bird.py +18 -42
  67. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
  68. transformers/models/biogpt/modeling_biogpt.py +2 -0
  69. transformers/models/biogpt/modular_biogpt.py +2 -0
  70. transformers/models/bit/modeling_bit.py +11 -2
  71. transformers/models/bitnet/modeling_bitnet.py +4 -4
  72. transformers/models/blenderbot/modeling_blenderbot.py +5 -0
  73. transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
  74. transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
  75. transformers/models/blip/modeling_blip_text.py +2 -0
  76. transformers/models/blip_2/modeling_blip_2.py +2 -1
  77. transformers/models/bloom/modeling_bloom.py +4 -0
  78. transformers/models/blt/modeling_blt.py +2 -2
  79. transformers/models/blt/modular_blt.py +2 -2
  80. transformers/models/bridgetower/modeling_bridgetower.py +5 -1
  81. transformers/models/bros/modeling_bros.py +4 -0
  82. transformers/models/camembert/tokenization_camembert.py +8 -12
  83. transformers/models/canine/modeling_canine.py +5 -0
  84. transformers/models/chameleon/modeling_chameleon.py +2 -1
  85. transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
  86. transformers/models/clap/modeling_clap.py +5 -0
  87. transformers/models/clip/tokenization_clip.py +22 -44
  88. transformers/models/clipseg/modeling_clipseg.py +5 -0
  89. transformers/models/clvp/modeling_clvp.py +5 -0
  90. transformers/models/clvp/tokenization_clvp.py +1 -63
  91. transformers/models/code_llama/tokenization_code_llama.py +20 -43
  92. transformers/models/codegen/tokenization_codegen.py +14 -43
  93. transformers/models/cohere/modeling_cohere.py +4 -3
  94. transformers/models/cohere/modular_cohere.py +2 -1
  95. transformers/models/cohere/tokenization_cohere.py +12 -42
  96. transformers/models/cohere2/modeling_cohere2.py +7 -6
  97. transformers/models/cohere2/modular_cohere2.py +5 -5
  98. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
  99. transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
  100. transformers/models/colqwen2/modeling_colqwen2.py +1 -0
  101. transformers/models/colqwen2/modular_colqwen2.py +1 -0
  102. transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
  103. transformers/models/convbert/modeling_convbert.py +6 -0
  104. transformers/models/convnext/modeling_convnext.py +2 -4
  105. transformers/models/convnextv2/modeling_convnextv2.py +2 -4
  106. transformers/models/csm/modeling_csm.py +4 -3
  107. transformers/models/ctrl/modeling_ctrl.py +1 -0
  108. transformers/models/cvt/modeling_cvt.py +2 -0
  109. transformers/models/cwm/modeling_cwm.py +4 -4
  110. transformers/models/d_fine/modeling_d_fine.py +2 -0
  111. transformers/models/d_fine/modular_d_fine.py +1 -0
  112. transformers/models/dab_detr/modeling_dab_detr.py +4 -0
  113. transformers/models/dac/modeling_dac.py +2 -2
  114. transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
  115. transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
  116. transformers/models/dbrx/modeling_dbrx.py +2 -2
  117. transformers/models/deberta/modeling_deberta.py +5 -0
  118. transformers/models/deberta/tokenization_deberta.py +11 -20
  119. transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
  120. transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
  121. transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
  122. transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
  123. transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
  124. transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
  125. transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
  126. transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
  127. transformers/models/depth_anything/modeling_depth_anything.py +1 -0
  128. transformers/models/depth_pro/modeling_depth_pro.py +2 -0
  129. transformers/models/detr/modeling_detr.py +5 -0
  130. transformers/models/dia/modeling_dia.py +4 -3
  131. transformers/models/dia/modular_dia.py +0 -1
  132. transformers/models/diffllama/modeling_diffllama.py +2 -2
  133. transformers/models/dinat/modeling_dinat.py +3 -0
  134. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
  135. transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
  136. transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
  137. transformers/models/distilbert/tokenization_distilbert.py +13 -0
  138. transformers/models/doge/modeling_doge.py +2 -3
  139. transformers/models/doge/modular_doge.py +0 -1
  140. transformers/models/donut/modeling_donut_swin.py +2 -0
  141. transformers/models/dots1/modeling_dots1.py +10 -7
  142. transformers/models/dots1/modular_dots1.py +5 -3
  143. transformers/models/dpr/modeling_dpr.py +5 -0
  144. transformers/models/dpr/tokenization_dpr.py +12 -0
  145. transformers/models/edgetam/modeling_edgetam.py +1 -1
  146. transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
  147. transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
  148. transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
  149. transformers/models/efficientnet/modeling_efficientnet.py +2 -0
  150. transformers/models/emu3/modeling_emu3.py +4 -4
  151. transformers/models/eomt/image_processing_eomt.py +13 -1
  152. transformers/models/eomt/image_processing_eomt_fast.py +14 -2
  153. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  154. transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
  155. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
  156. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
  157. transformers/models/esm/modeling_esmfold.py +5 -4
  158. transformers/models/evolla/modeling_evolla.py +4 -4
  159. transformers/models/exaone4/modeling_exaone4.py +2 -2
  160. transformers/models/exaone4/modular_exaone4.py +0 -1
  161. transformers/models/falcon/modeling_falcon.py +6 -1
  162. transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
  163. transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
  164. transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
  165. transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
  166. transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
  167. transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
  168. transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
  169. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
  170. transformers/models/flaubert/modeling_flaubert.py +7 -0
  171. transformers/models/flava/modeling_flava.py +6 -1
  172. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
  173. transformers/models/florence2/modeling_florence2.py +2 -1
  174. transformers/models/florence2/modular_florence2.py +2 -1
  175. transformers/models/fnet/modeling_fnet.py +7 -0
  176. transformers/models/focalnet/modeling_focalnet.py +4 -0
  177. transformers/models/fsmt/modeling_fsmt.py +2 -0
  178. transformers/models/funnel/modeling_funnel.py +8 -0
  179. transformers/models/funnel/tokenization_funnel.py +17 -24
  180. transformers/models/fuyu/processing_fuyu.py +3 -3
  181. transformers/models/gemma/modeling_gemma.py +4 -4
  182. transformers/models/gemma/tokenization_gemma.py +10 -27
  183. transformers/models/gemma2/modeling_gemma2.py +4 -4
  184. transformers/models/gemma2/modular_gemma2.py +2 -1
  185. transformers/models/gemma3/modeling_gemma3.py +14 -84
  186. transformers/models/gemma3/modular_gemma3.py +12 -81
  187. transformers/models/gemma3n/modeling_gemma3n.py +18 -209
  188. transformers/models/gemma3n/modular_gemma3n.py +17 -59
  189. transformers/models/git/modeling_git.py +2 -0
  190. transformers/models/glm/modeling_glm.py +4 -4
  191. transformers/models/glm4/modeling_glm4.py +4 -4
  192. transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
  193. transformers/models/glm4v/configuration_glm4v.py +3 -1
  194. transformers/models/glm4v/modeling_glm4v.py +3 -3
  195. transformers/models/glm4v/modular_glm4v.py +6 -4
  196. transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
  197. transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
  198. transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
  199. transformers/models/glpn/modeling_glpn.py +2 -0
  200. transformers/models/gpt2/modeling_gpt2.py +5 -1
  201. transformers/models/gpt2/tokenization_gpt2.py +16 -44
  202. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
  203. transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
  204. transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
  205. transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
  206. transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
  207. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
  208. transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
  209. transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
  210. transformers/models/gptj/modeling_gptj.py +3 -0
  211. transformers/models/granite/modeling_granite.py +4 -4
  212. transformers/models/granitemoe/modeling_granitemoe.py +4 -6
  213. transformers/models/granitemoe/modular_granitemoe.py +0 -2
  214. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
  215. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
  216. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
  217. transformers/models/groupvit/modeling_groupvit.py +3 -0
  218. transformers/models/helium/modeling_helium.py +4 -3
  219. transformers/models/herbert/tokenization_herbert.py +9 -25
  220. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
  221. transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
  222. transformers/models/hiera/modeling_hiera.py +4 -0
  223. transformers/models/hubert/modeling_hubert.py +3 -0
  224. transformers/models/hubert/modular_hubert.py +1 -0
  225. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
  226. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
  227. transformers/models/ibert/modeling_ibert.py +6 -0
  228. transformers/models/idefics/modeling_idefics.py +5 -21
  229. transformers/models/imagegpt/modeling_imagegpt.py +2 -1
  230. transformers/models/informer/modeling_informer.py +4 -0
  231. transformers/models/informer/modular_informer.py +1 -0
  232. transformers/models/internvl/modeling_internvl.py +2 -4
  233. transformers/models/internvl/modular_internvl.py +2 -4
  234. transformers/models/jamba/modeling_jamba.py +2 -2
  235. transformers/models/janus/modeling_janus.py +1 -0
  236. transformers/models/janus/modular_janus.py +1 -0
  237. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  238. transformers/models/kosmos2/modeling_kosmos2.py +1 -0
  239. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
  240. transformers/models/lasr/__init__.py +29 -0
  241. transformers/models/lasr/configuration_lasr.py +244 -0
  242. transformers/models/lasr/feature_extraction_lasr.py +277 -0
  243. transformers/models/lasr/modeling_lasr.py +729 -0
  244. transformers/models/lasr/modular_lasr.py +569 -0
  245. transformers/models/lasr/processing_lasr.py +96 -0
  246. transformers/models/lasr/tokenization_lasr.py +186 -0
  247. transformers/models/layoutlm/modeling_layoutlm.py +5 -0
  248. transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
  249. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
  250. transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
  251. transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
  252. transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
  253. transformers/models/led/modeling_led.py +6 -0
  254. transformers/models/levit/modeling_levit.py +3 -0
  255. transformers/models/lfm2/modeling_lfm2.py +4 -5
  256. transformers/models/lfm2/modular_lfm2.py +0 -1
  257. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
  258. transformers/models/lightglue/modeling_lightglue.py +3 -1
  259. transformers/models/lightglue/modular_lightglue.py +1 -0
  260. transformers/models/lilt/modeling_lilt.py +4 -0
  261. transformers/models/llama/modeling_llama.py +4 -4
  262. transformers/models/llama/tokenization_llama.py +15 -43
  263. transformers/models/llama4/modeling_llama4.py +3 -2
  264. transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
  265. transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
  266. transformers/models/longformer/modeling_longformer.py +6 -0
  267. transformers/models/longt5/modeling_longt5.py +4 -0
  268. transformers/models/luke/modeling_luke.py +9 -0
  269. transformers/models/luke/tokenization_luke.py +11 -38
  270. transformers/models/lxmert/modeling_lxmert.py +2 -0
  271. transformers/models/m2m_100/modeling_m2m_100.py +4 -0
  272. transformers/models/mamba/modeling_mamba.py +14 -22
  273. transformers/models/marian/modeling_marian.py +5 -0
  274. transformers/models/markuplm/modeling_markuplm.py +4 -0
  275. transformers/models/markuplm/tokenization_markuplm.py +28 -61
  276. transformers/models/mask2former/modeling_mask2former.py +2 -0
  277. transformers/models/maskformer/modeling_maskformer.py +2 -0
  278. transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
  279. transformers/models/mbart/modeling_mbart.py +7 -0
  280. transformers/models/mbart/tokenization_mbart.py +11 -52
  281. transformers/models/mbart50/tokenization_mbart50.py +7 -10
  282. transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
  283. transformers/models/mgp_str/modeling_mgp_str.py +2 -0
  284. transformers/models/mimi/modeling_mimi.py +3 -1
  285. transformers/models/minimax/modeling_minimax.py +4 -4
  286. transformers/models/ministral/modeling_ministral.py +4 -4
  287. transformers/models/ministral3/configuration_ministral3.py +1 -1
  288. transformers/models/ministral3/modeling_ministral3.py +4 -3
  289. transformers/models/mistral/modeling_mistral.py +4 -3
  290. transformers/models/mixtral/modeling_mixtral.py +4 -4
  291. transformers/models/mllama/modeling_mllama.py +2 -2
  292. transformers/models/mluke/tokenization_mluke.py +6 -6
  293. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
  294. transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
  295. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
  296. transformers/models/mobilevit/modeling_mobilevit.py +3 -0
  297. transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
  298. transformers/models/modernbert/modeling_modernbert.py +4 -1
  299. transformers/models/modernbert/modular_modernbert.py +2 -0
  300. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
  301. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
  302. transformers/models/moonshine/modeling_moonshine.py +4 -2
  303. transformers/models/moshi/modeling_moshi.py +5 -2
  304. transformers/models/mpnet/modeling_mpnet.py +5 -0
  305. transformers/models/mpnet/tokenization_mpnet.py +5 -13
  306. transformers/models/mpt/modeling_mpt.py +2 -0
  307. transformers/models/mra/modeling_mra.py +6 -0
  308. transformers/models/mt5/modeling_mt5.py +7 -0
  309. transformers/models/musicgen/modeling_musicgen.py +2 -0
  310. transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
  311. transformers/models/mvp/modeling_mvp.py +7 -0
  312. transformers/models/nanochat/modeling_nanochat.py +4 -4
  313. transformers/models/nemotron/modeling_nemotron.py +4 -2
  314. transformers/models/nllb/tokenization_nllb.py +8 -22
  315. transformers/models/nougat/tokenization_nougat.py +11 -59
  316. transformers/models/nystromformer/modeling_nystromformer.py +6 -0
  317. transformers/models/olmo/modeling_olmo.py +4 -4
  318. transformers/models/olmo/modular_olmo.py +2 -2
  319. transformers/models/olmo2/modeling_olmo2.py +4 -5
  320. transformers/models/olmo2/modular_olmo2.py +0 -1
  321. transformers/models/olmo3/modeling_olmo3.py +4 -4
  322. transformers/models/olmoe/modeling_olmoe.py +4 -4
  323. transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
  324. transformers/models/oneformer/modeling_oneformer.py +4 -1
  325. transformers/models/openai/modeling_openai.py +3 -0
  326. transformers/models/openai/tokenization_openai.py +10 -46
  327. transformers/models/opt/modeling_opt.py +2 -0
  328. transformers/models/owlv2/modeling_owlv2.py +4 -0
  329. transformers/models/owlvit/modeling_owlvit.py +4 -0
  330. transformers/models/paddleocr_vl/__init__.py +32 -0
  331. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
  332. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
  333. transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
  334. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
  335. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
  336. transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
  337. transformers/models/parakeet/configuration_parakeet.py +4 -6
  338. transformers/models/parakeet/modeling_parakeet.py +9 -6
  339. transformers/models/parakeet/modular_parakeet.py +2 -2
  340. transformers/models/parakeet/processing_parakeet.py +1 -0
  341. transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
  342. transformers/models/patchtst/modeling_patchtst.py +20 -2
  343. transformers/models/pegasus/modeling_pegasus.py +5 -0
  344. transformers/models/pegasus/tokenization_pegasus.py +17 -44
  345. transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
  346. transformers/models/perceiver/modeling_perceiver.py +8 -0
  347. transformers/models/persimmon/modeling_persimmon.py +2 -1
  348. transformers/models/phi/modeling_phi.py +4 -5
  349. transformers/models/phi/modular_phi.py +0 -1
  350. transformers/models/phi3/modeling_phi3.py +2 -1
  351. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
  352. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
  353. transformers/models/phimoe/modeling_phimoe.py +4 -4
  354. transformers/models/phimoe/modular_phimoe.py +2 -2
  355. transformers/models/pix2struct/modeling_pix2struct.py +2 -0
  356. transformers/models/pixtral/modeling_pixtral.py +2 -1
  357. transformers/models/plbart/modeling_plbart.py +6 -0
  358. transformers/models/plbart/modular_plbart.py +2 -0
  359. transformers/models/plbart/tokenization_plbart.py +0 -2
  360. transformers/models/poolformer/modeling_poolformer.py +2 -0
  361. transformers/models/pop2piano/modeling_pop2piano.py +2 -0
  362. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
  363. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
  364. transformers/models/prophetnet/modeling_prophetnet.py +3 -0
  365. transformers/models/pvt/modeling_pvt.py +2 -0
  366. transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
  367. transformers/models/qwen2/modeling_qwen2.py +4 -4
  368. transformers/models/qwen2/tokenization_qwen2.py +14 -18
  369. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
  370. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
  371. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
  372. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
  373. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
  374. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
  375. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
  376. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  377. transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
  378. transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
  379. transformers/models/qwen3/modeling_qwen3.py +4 -4
  380. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  381. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
  382. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
  383. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
  384. transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
  385. transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
  386. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
  387. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
  388. transformers/models/rag/modeling_rag.py +1 -0
  389. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
  390. transformers/models/reformer/modeling_reformer.py +4 -0
  391. transformers/models/reformer/tokenization_reformer.py +11 -28
  392. transformers/models/regnet/modeling_regnet.py +6 -1
  393. transformers/models/rembert/modeling_rembert.py +6 -0
  394. transformers/models/rembert/tokenization_rembert.py +3 -10
  395. transformers/models/resnet/modeling_resnet.py +11 -2
  396. transformers/models/roberta/tokenization_roberta.py +18 -27
  397. transformers/models/roformer/modeling_roformer.py +6 -0
  398. transformers/models/roformer/tokenization_roformer.py +77 -412
  399. transformers/models/rt_detr/modeling_rt_detr.py +2 -0
  400. transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
  401. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
  402. transformers/models/rwkv/modeling_rwkv.py +1 -0
  403. transformers/models/sam2/modeling_sam2.py +2 -2
  404. transformers/models/sam2/modular_sam2.py +2 -2
  405. transformers/models/sam2_video/modeling_sam2_video.py +1 -0
  406. transformers/models/sam2_video/modular_sam2_video.py +1 -0
  407. transformers/models/sam3/modeling_sam3.py +77 -80
  408. transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
  409. transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
  410. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
  411. transformers/models/sam3_video/modeling_sam3_video.py +1 -0
  412. transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
  413. transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
  414. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
  415. transformers/models/seed_oss/modeling_seed_oss.py +2 -2
  416. transformers/models/segformer/modeling_segformer.py +4 -1
  417. transformers/models/seggpt/modeling_seggpt.py +2 -0
  418. transformers/models/sew/modeling_sew.py +3 -0
  419. transformers/models/sew/modular_sew.py +1 -0
  420. transformers/models/sew_d/modeling_sew_d.py +3 -0
  421. transformers/models/siglip2/modeling_siglip2.py +4 -0
  422. transformers/models/siglip2/modular_siglip2.py +4 -0
  423. transformers/models/smollm3/modeling_smollm3.py +4 -4
  424. transformers/models/smolvlm/processing_smolvlm.py +0 -7
  425. transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
  426. transformers/models/speecht5/modeling_speecht5.py +13 -1
  427. transformers/models/splinter/modeling_splinter.py +3 -0
  428. transformers/models/splinter/tokenization_splinter.py +9 -28
  429. transformers/models/squeezebert/modeling_squeezebert.py +6 -0
  430. transformers/models/stablelm/modeling_stablelm.py +3 -1
  431. transformers/models/starcoder2/modeling_starcoder2.py +4 -3
  432. transformers/models/superglue/modeling_superglue.py +1 -0
  433. transformers/models/superpoint/modeling_superpoint.py +1 -0
  434. transformers/models/swiftformer/modeling_swiftformer.py +2 -0
  435. transformers/models/swin/modeling_swin.py +4 -0
  436. transformers/models/swin2sr/modeling_swin2sr.py +2 -0
  437. transformers/models/swinv2/modeling_swinv2.py +4 -0
  438. transformers/models/t5/modeling_t5.py +7 -0
  439. transformers/models/t5/tokenization_t5.py +4 -8
  440. transformers/models/t5gemma/modeling_t5gemma.py +5 -5
  441. transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
  442. transformers/models/table_transformer/modeling_table_transformer.py +4 -0
  443. transformers/models/tapas/modeling_tapas.py +3 -0
  444. transformers/models/textnet/modeling_textnet.py +11 -2
  445. transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
  446. transformers/models/timesfm/modeling_timesfm.py +2 -0
  447. transformers/models/timesfm/modular_timesfm.py +2 -0
  448. transformers/models/timesformer/modeling_timesformer.py +2 -0
  449. transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
  450. transformers/models/trocr/modeling_trocr.py +2 -0
  451. transformers/models/tvp/modeling_tvp.py +2 -0
  452. transformers/models/udop/modeling_udop.py +4 -0
  453. transformers/models/udop/tokenization_udop.py +5 -13
  454. transformers/models/umt5/modeling_umt5.py +7 -0
  455. transformers/models/unispeech/modeling_unispeech.py +4 -0
  456. transformers/models/unispeech/modular_unispeech.py +2 -0
  457. transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
  458. transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
  459. transformers/models/univnet/modeling_univnet.py +1 -0
  460. transformers/models/upernet/modeling_upernet.py +1 -0
  461. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  462. transformers/models/vilt/modeling_vilt.py +6 -0
  463. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
  464. transformers/models/visual_bert/modeling_visual_bert.py +6 -0
  465. transformers/models/vitdet/modeling_vitdet.py +2 -0
  466. transformers/models/vitmatte/modeling_vitmatte.py +1 -0
  467. transformers/models/vits/modeling_vits.py +1 -0
  468. transformers/models/vjepa2/modeling_vjepa2.py +1 -0
  469. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
  470. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
  471. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
  472. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
  473. transformers/models/wavlm/modeling_wavlm.py +5 -0
  474. transformers/models/whisper/modeling_whisper.py +6 -0
  475. transformers/models/whisper/tokenization_whisper.py +4 -15
  476. transformers/models/x_clip/modeling_x_clip.py +3 -0
  477. transformers/models/xglm/modeling_xglm.py +1 -0
  478. transformers/models/xglm/tokenization_xglm.py +4 -9
  479. transformers/models/xlm/modeling_xlm.py +5 -0
  480. transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
  481. transformers/models/xlnet/tokenization_xlnet.py +3 -7
  482. transformers/models/yoso/modeling_yoso.py +6 -0
  483. transformers/models/zamba/modeling_zamba.py +2 -0
  484. transformers/models/zamba2/modeling_zamba2.py +4 -2
  485. transformers/models/zamba2/modular_zamba2.py +1 -1
  486. transformers/models/zoedepth/modeling_zoedepth.py +1 -0
  487. transformers/pipelines/__init__.py +2 -3
  488. transformers/pipelines/base.py +1 -9
  489. transformers/pipelines/document_question_answering.py +3 -1
  490. transformers/pipelines/text_generation.py +1 -1
  491. transformers/processing_utils.py +23 -11
  492. transformers/quantizers/base.py +35 -110
  493. transformers/quantizers/quantizer_aqlm.py +1 -5
  494. transformers/quantizers/quantizer_auto_round.py +1 -2
  495. transformers/quantizers/quantizer_awq.py +17 -81
  496. transformers/quantizers/quantizer_bitnet.py +3 -8
  497. transformers/quantizers/quantizer_bnb_4bit.py +13 -110
  498. transformers/quantizers/quantizer_bnb_8bit.py +16 -92
  499. transformers/quantizers/quantizer_compressed_tensors.py +1 -5
  500. transformers/quantizers/quantizer_eetq.py +14 -62
  501. transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
  502. transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
  503. transformers/quantizers/quantizer_fp_quant.py +48 -78
  504. transformers/quantizers/quantizer_gptq.py +7 -24
  505. transformers/quantizers/quantizer_higgs.py +40 -54
  506. transformers/quantizers/quantizer_hqq.py +144 -153
  507. transformers/quantizers/quantizer_mxfp4.py +13 -167
  508. transformers/quantizers/quantizer_quanto.py +20 -64
  509. transformers/quantizers/quantizer_quark.py +36 -17
  510. transformers/quantizers/quantizer_spqr.py +1 -4
  511. transformers/quantizers/quantizer_torchao.py +23 -202
  512. transformers/quantizers/quantizer_vptq.py +8 -22
  513. transformers/quantizers/quantizers_utils.py +20 -0
  514. transformers/testing_utils.py +297 -36
  515. transformers/tokenization_mistral_common.py +4 -0
  516. transformers/tokenization_utils_base.py +113 -222
  517. transformers/tokenization_utils_tokenizers.py +168 -107
  518. transformers/trainer.py +28 -31
  519. transformers/trainer_jit_checkpoint.py +126 -0
  520. transformers/trainer_utils.py +1 -1
  521. transformers/training_args.py +66 -28
  522. transformers/utils/__init__.py +3 -4
  523. transformers/utils/auto_docstring.py +1 -0
  524. transformers/utils/generic.py +27 -1
  525. transformers/utils/hub.py +5 -15
  526. transformers/utils/import_utils.py +61 -16
  527. transformers/utils/kernel_config.py +4 -2
  528. transformers/utils/loading_report.py +19 -10
  529. transformers/utils/quantization_config.py +75 -242
  530. transformers/video_processing_utils.py +1 -2
  531. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
  532. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
  533. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
  534. transformers/kernels/__init__.py +0 -0
  535. transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
  536. transformers/models/roformer/tokenization_roformer_fast.py +0 -160
  537. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
  538. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
  539. {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py (new file)
@@ -0,0 +1,503 @@
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+ # This file was automatically generated from src/transformers/models/paddleocr_vl/modular_paddleocr_vl.py.
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
+ # the file from the modular. If any change should be done, please apply the change to the
+ # modular_paddleocr_vl.py file directly. One of our CI checks enforces this.
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+ # Copyright 2025 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
+ #
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+ # and OPT implementations in this library. It has been modified from its
+ # original forms to accommodate minor architectural differences compared
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import math
+ from typing import Optional, Union
+
+ import numpy as np
+
+ from ...image_processing_utils import BaseImageProcessor, BatchFeature
+ from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+ from ...image_utils import (
+     OPENAI_CLIP_MEAN,
+     OPENAI_CLIP_STD,
+     ChannelDimension,
+     ImageInput,
+     PILImageResampling,
+     get_image_size,
+     infer_channel_dimension_format,
+     is_scaled_image,
+     make_flat_list_of_images,
+     make_list_of_images,
+     to_numpy_array,
+     valid_images,
+     validate_preprocess_arguments,
+ )
+ from ...processing_utils import ImagesKwargs
+ from ...utils import TensorType, logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class PaddleOCRVLImageProcessorKwargs(ImagesKwargs, total=False):
+     r"""
+     min_pixels (`int`, *optional*, defaults to `384 * 384`):
+         The minimum number of pixels allowed when resizing the image.
+     max_pixels (`int`, *optional*, defaults to `1536 * 1536`):
+         The maximum number of pixels allowed when resizing the image.
+     patch_size (`int`, *optional*, defaults to 14):
+         The spatial patch size of the vision encoder.
+     temporal_patch_size (`int`, *optional*, defaults to 1):
+         The temporal patch size of the vision encoder.
+     merge_size (`int`, *optional*, defaults to 2):
+         The merge size of the vision encoder to the LLM encoder.
+     """
+
+     min_pixels: int
+     max_pixels: int
+     patch_size: int
+     temporal_patch_size: int
+     merge_size: int
+
+
+ def smart_resize(
+     height: int,
+     width: int,
+     factor: int = 28,
+     min_pixels: int = 384 * 384,
+     max_pixels: int = 1536 * 1536,
+ ):
+     if height < factor:
+         width = round((width * factor) / height)
+         height = factor
+
+     if width < factor:
+         height = round((height * factor) / width)
+         width = factor
+
+     if max(height, width) / min(height, width) > 200:
+         raise ValueError(
+             f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+         )
+     h_bar = round(height / factor) * factor
+     w_bar = round(width / factor) * factor
+     if h_bar * w_bar > max_pixels:
+         beta = math.sqrt((height * width) / max_pixels)
+         h_bar = math.floor(height / beta / factor) * factor
+         w_bar = math.floor(width / beta / factor) * factor
+     elif h_bar * w_bar < min_pixels:
+         beta = math.sqrt(min_pixels / (height * width))
+         h_bar = math.ceil(height * beta / factor) * factor
+         w_bar = math.ceil(width * beta / factor) * factor
+     return h_bar, w_bar
+
+
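The rounding rule in `smart_resize` is easiest to see with numbers. Below is a quick sanity-check sketch (an editor's illustration, not part of the diff); it assumes `transformers` 5.0.0rc1 is installed so the new module path shown in the file listing resolves:

```python
# Feed a couple of sizes through smart_resize to see the rounding behavior.
from transformers.models.paddleocr_vl.image_processing_paddleocr_vl import smart_resize

# factor = patch_size * merge_size = 14 * 2 = 28 for the defaults in this file.
h, w = smart_resize(1000, 750, factor=28, min_pixels=384 * 384, max_pixels=1536 * 1536)
print(h, w)                   # 1008 756: each side rounded to the nearest multiple of 28
print((h // 14) * (w // 14))  # 3888 spatial patches of 14x14

# A 4000x3000 image exceeds max_pixels, so both sides are scaled down by
# beta = sqrt(H*W / max_pixels) before rounding down to multiples of 28.
h, w = smart_resize(4000, 3000, factor=28, min_pixels=384 * 384, max_pixels=1536 * 1536)
assert h * w <= 1536 * 1536
```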
+ class PaddleOCRVLImageProcessor(BaseImageProcessor):
+     r"""
+     Constructs a PaddleOCRVL image processor that dynamically resizes images based on the original images.
+
+     Args:
+         do_resize (`bool`, *optional*, defaults to `True`):
+             Whether to resize the image's (height, width) dimensions.
+         size (`dict[str, int]`, *optional*):
+             Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
+         resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+             Resampling filter to use when resizing the image.
+         do_rescale (`bool`, *optional*, defaults to `True`):
+             Whether to rescale the image by the specified scale `rescale_factor`.
+         rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+             Scale factor to use if rescaling the image.
+         do_normalize (`bool`, *optional*, defaults to `True`):
+             Whether to normalize the image.
+         image_mean (`float` or `list[float]`, *optional*):
+             Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
+         image_std (`float` or `list[float]`, *optional*):
+             Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
+         do_convert_rgb (`bool`, *optional*, defaults to `True`):
+             Whether to convert the image to RGB.
+         min_pixels (`int`, *optional*, defaults to `384 * 384`):
+             The minimum number of pixels allowed when resizing the image.
+         max_pixels (`int`, *optional*, defaults to `1536 * 1536`):
+             The maximum number of pixels allowed when resizing the image.
+         patch_size (`int`, *optional*, defaults to 14):
+             The spatial patch size of the vision encoder.
+         temporal_patch_size (`int`, *optional*, defaults to 1):
+             The temporal patch size of the vision encoder.
+         merge_size (`int`, *optional*, defaults to 2):
+             The merge size of the vision encoder to the LLM encoder.
+     """
+
+     model_input_names = [
+         "pixel_values",
+         "image_grid_thw",
+     ]
+     valid_kwargs = PaddleOCRVLImageProcessorKwargs
+
+     def __init__(
+         self,
+         do_resize: bool = True,
+         size: Optional[dict[str, int]] = None,
+         resample: PILImageResampling = PILImageResampling.BICUBIC,
+         do_rescale: bool = True,
+         rescale_factor: Union[int, float] = 1 / 255,
+         do_normalize: bool = True,
+         image_mean: Optional[Union[float, list[float]]] = None,
+         image_std: Optional[Union[float, list[float]]] = None,
+         do_convert_rgb: bool = True,
+         min_pixels: int = 384 * 384,
+         max_pixels: int = 1536 * 1536,
+         patch_size: int = 14,
+         temporal_patch_size: int = 1,
+         merge_size: int = 2,
+         **kwargs,
+     ) -> None:
+         super().__init__(**kwargs)
+         if size is not None:
+             if "shortest_edge" not in size or "longest_edge" not in size:
+                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+         else:
+             size = {"shortest_edge": 56 * 56, "longest_edge": 28 * 28 * 1280}
+         # backward compatibility: override size with min_pixels and max_pixels if they are provided
+         if min_pixels is not None:
+             size["shortest_edge"] = min_pixels
+         if max_pixels is not None:
+             size["longest_edge"] = max_pixels
+         self.min_pixels = size["shortest_edge"]
+         self.max_pixels = size["longest_edge"]
+         self.size = size
+
+         self.do_resize = do_resize
+         self.resample = resample
+         self.do_rescale = do_rescale
+         self.rescale_factor = rescale_factor
+         self.do_normalize = do_normalize
+         self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+         self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+
+         self.patch_size = patch_size
+         self.temporal_patch_size = temporal_patch_size
+         self.merge_size = merge_size
+         self.do_convert_rgb = do_convert_rgb
+
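Note how the constructor folds the legacy `min_pixels`/`max_pixels` arguments into the `size` dict. A minimal sketch of that interplay (illustration only, under the same installation assumption as above):

```python
from transformers.models.paddleocr_vl.image_processing_paddleocr_vl import (
    PaddleOCRVLImageProcessor,
)

# Defaults: min_pixels/max_pixels fill in size["shortest_edge"/"longest_edge"].
proc = PaddleOCRVLImageProcessor()
print(proc.size)  # {'shortest_edge': 147456, 'longest_edge': 2359296}, i.e. 384*384 and 1536*1536

# The legacy kwargs win over the built-in default size dict.
proc = PaddleOCRVLImageProcessor(min_pixels=512 * 512, max_pixels=1024 * 1024)
print(proc.min_pixels, proc.max_pixels)  # 262144 1048576
```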
+     def _preprocess(
+         self,
+         images: ImageInput,
+         do_resize: Optional[bool] = None,
+         size: Optional[dict[str, int]] = None,
+         resample: PILImageResampling = None,
+         do_rescale: Optional[bool] = None,
+         rescale_factor: Optional[float] = None,
+         do_normalize: Optional[bool] = None,
+         image_mean: Optional[Union[float, list[float]]] = None,
+         image_std: Optional[Union[float, list[float]]] = None,
+         patch_size: Optional[int] = None,
+         temporal_patch_size: Optional[int] = None,
+         merge_size: Optional[int] = None,
+         do_convert_rgb: Optional[bool] = None,
+         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+     ):
+         """
+         Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
+
+         Args:
+             images (`ImageInput`):
+                 Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
+             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                 Whether to resize the image.
+             size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                 Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
+             resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                 Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
+             do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                 Whether to rescale the image.
+             rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                 Scale factor to use if rescaling the image.
+             do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                 Whether to normalize the image.
+             image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                 Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+             image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                 Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
+             patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                 The spatial patch size of the vision encoder.
+             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
+                 The temporal patch size of the vision encoder.
+             merge_size (`int`, *optional*, defaults to `self.merge_size`):
+                 The merge size of the vision encoder to the LLM encoder.
+             do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                 Whether to convert the image to RGB.
+             data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
+                 The channel dimension format for the output image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - Unset: Use the channel dimension format of the input image.
+             input_data_format (`ChannelDimension` or `str`, *optional*):
+                 The channel dimension format for the input image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+         """
+         images = make_list_of_images(images)
+         images = self.fetch_images(images)
+
+         if do_convert_rgb:
+             images = [convert_to_rgb(image) for image in images]
+
+         # All transformations expect numpy arrays.
+         images = [to_numpy_array(image) for image in images]
+
+         if is_scaled_image(images[0]) and do_rescale:
+             logger.warning_once(
+                 "It looks like you are trying to rescale already rescaled images. If the input"
+                 " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+             )
+         if input_data_format is None:
+             # We assume that all images have the same channel dimension format.
+             input_data_format = infer_channel_dimension_format(images[0])
+
+         height, width = get_image_size(images[0], channel_dim=input_data_format)
+         resized_height, resized_width = height, width
+         processed_images = []
+
+         for image in images:
+             if do_resize:
+                 resized_height, resized_width = smart_resize(
+                     height,
+                     width,
+                     factor=patch_size * merge_size,
+                     min_pixels=size["shortest_edge"],
+                     max_pixels=size["longest_edge"],
+                 )
+                 image = resize(
+                     image,
+                     size=(resized_height, resized_width),
+                     resample=resample,
+                     input_data_format=input_data_format,
+                 )
+
+             if do_rescale:
+                 image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
+
+             if do_normalize:
+                 image = self.normalize(
+                     image=image,
+                     mean=image_mean,
+                     std=image_std,
+                     input_data_format=input_data_format,
+                 )
+             image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+             processed_images.append(image)
+
+         patches = np.array(processed_images)
+         if data_format == ChannelDimension.LAST:
+             patches = patches.transpose(0, 3, 1, 2)
+         if patches.shape[0] == 1:
+             patches = np.tile(patches, (temporal_patch_size, 1, 1, 1))
+
+         channel = patches.shape[1]
+         grid_t = patches.shape[0] // temporal_patch_size
+         grid_h, grid_w = (
+             resized_height // patch_size,
+             resized_width // patch_size,
+         )
+         patches = patches.reshape(
+             grid_t,
+             temporal_patch_size,
+             channel,
+             grid_h,
+             patch_size,
+             grid_w,
+             patch_size,
+         )
+         patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
+         if temporal_patch_size != 1:
+             raise ValueError(f"temporal_patch_size must be 1, but got {temporal_patch_size}!")
+         flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, patch_size, patch_size)
+         return flatten_patches, (grid_t, grid_h, grid_w)
+
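The reshape/transpose sequence at the end of `_preprocess` is pure index bookkeeping, so it can be verified with a standalone numpy sketch (an editor's illustration, not part of the diff; the shapes below assume `patch_size=14`, `temporal_patch_size=1`, and a 1008x756 post-resize image):

```python
import numpy as np

patch_size, temporal_patch_size = 14, 1
patches = np.zeros((1, 3, 1008, 756))             # (num_frames, C, H, W) after normalization
grid_t = patches.shape[0] // temporal_patch_size  # 1
grid_h, grid_w = 1008 // patch_size, 756 // patch_size  # 72, 54

# Split H and W into (grid, patch) pairs, then bring the grid axes to the front.
patches = patches.reshape(grid_t, temporal_patch_size, 3, grid_h, patch_size, grid_w, patch_size)
patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)  # -> (t, h, w, C, T, p, p)
flat = patches.reshape(grid_t * grid_h * grid_w, 3, patch_size, patch_size)
print(flat.shape)  # (3888, 3, 14, 14): one row per 14x14 patch, ordered (t, h, w)
```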
+     def preprocess(
+         self,
+         images: ImageInput,
+         do_resize: Optional[bool] = None,
+         size: Optional[dict[str, int]] = None,
+         min_pixels: Optional[int] = None,
+         max_pixels: Optional[int] = None,
+         resample: Optional[PILImageResampling] = None,
+         do_rescale: Optional[bool] = None,
+         rescale_factor: Optional[float] = None,
+         do_normalize: Optional[bool] = None,
+         image_mean: Optional[Union[float, list[float]]] = None,
+         image_std: Optional[Union[float, list[float]]] = None,
+         patch_size: Optional[int] = None,
+         temporal_patch_size: Optional[int] = None,
+         merge_size: Optional[int] = None,
+         do_convert_rgb: Optional[bool] = None,
+         return_tensors: Optional[Union[str, TensorType]] = None,
+         data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+     ):
+         """
+         Args:
+             images (`ImageInput`):
+                 Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                 passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+             do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                 Whether to resize the image.
+             size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                 Size constraints for resizing. `shortest_edge` and `longest_edge` keys must be present; they are
+                 used as the minimum and maximum pixel counts passed to `smart_resize`.
+             resample (`int`, *optional*, defaults to `self.resample`):
+                 Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                 has an effect if `do_resize` is set to `True`.
+             do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                 Whether to rescale the image.
+             rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                 Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+             do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                 Whether to normalize the image.
+             image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                 Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+             image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                 Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                 `True`.
+             min_pixels (`int`, *optional*, defaults to `self.min_pixels`):
+                 The minimum number of pixels allowed when resizing the image.
+             max_pixels (`int`, *optional*, defaults to `self.max_pixels`):
+                 The maximum number of pixels allowed when resizing the image.
+             patch_size (`int`, *optional*, defaults to `self.patch_size`):
+                 The spatial patch size of the vision encoder.
+             temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
+                 The temporal patch size of the vision encoder.
+             merge_size (`int`, *optional*, defaults to `self.merge_size`):
+                 The merge size of the vision encoder to the LLM encoder.
+             do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                 Whether to convert the image to RGB.
+             return_tensors (`str` or `TensorType`, *optional*):
+                 The type of tensors to return. Can be one of:
+                 - Unset: Return a list of `np.ndarray`.
+                 - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                 - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+             data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                 The channel dimension format for the output image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - Unset: Use the channel dimension format of the input image.
+             input_data_format (`ChannelDimension` or `str`, *optional*):
+                 The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                 from the input image. Can be one of:
+                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+         """
+         min_pixels = min_pixels if min_pixels is not None else self.min_pixels
+         max_pixels = max_pixels if max_pixels is not None else self.max_pixels
+
+         if size is not None:
+             if "shortest_edge" not in size or "longest_edge" not in size:
+                 raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
+             min_pixels = size["shortest_edge"]
+         elif min_pixels is not None and max_pixels is not None:
+             # backward compatibility: override size with min_pixels and max_pixels if they are provided
+             size = {"shortest_edge": min_pixels, "longest_edge": max_pixels}
+         else:
+             size = {**self.size}
+
+         do_resize = do_resize if do_resize is not None else self.do_resize
+
+         resample = resample if resample is not None else self.resample
+         do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+         rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+         do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+         image_mean = image_mean if image_mean is not None else self.image_mean
+         image_std = image_std if image_std is not None else self.image_std
+         patch_size = patch_size if patch_size is not None else self.patch_size
+         temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
+         merge_size = merge_size if merge_size is not None else self.merge_size
+         do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+         if images is not None:
+             images = self.fetch_images(images)
+             images = make_flat_list_of_images(images)
+
+         if images is not None and not valid_images(images):
+             raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor")
+
+         validate_preprocess_arguments(
+             rescale_factor=rescale_factor,
+             do_normalize=do_normalize,
+             image_mean=image_mean,
+             image_std=image_std,
+             do_resize=do_resize,
+             size=size,
+             resample=resample,
+         )
+
+         data = {}
+         pixel_values, vision_grid_thws = [], []
+         for image in images:
+             patches, image_grid_thw = self._preprocess(
+                 image,
+                 do_resize=do_resize,
+                 size=size,
+                 resample=resample,
+                 do_rescale=do_rescale,
+                 rescale_factor=rescale_factor,
+                 do_normalize=do_normalize,
+                 image_mean=image_mean,
+                 image_std=image_std,
+                 patch_size=patch_size,
+                 temporal_patch_size=temporal_patch_size,
+                 merge_size=merge_size,
+                 data_format=data_format,
+                 do_convert_rgb=do_convert_rgb,
+                 input_data_format=input_data_format,
+             )
+             pixel_values.extend(patches)
+             vision_grid_thws.append(image_grid_thw)
+         pixel_values = np.array(pixel_values)
+         vision_grid_thws = np.array(vision_grid_thws)
+         data.update({"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws})
+
+         return BatchFeature(data=data, tensor_type=return_tensors)
+
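A hypothetical end-to-end call (illustration only; assumes transformers 5.0.0rc1 and Pillow are installed, and a solid 750x1000 test image rather than real data):

```python
from PIL import Image
from transformers.models.paddleocr_vl.image_processing_paddleocr_vl import (
    PaddleOCRVLImageProcessor,
)

processor = PaddleOCRVLImageProcessor()
image = Image.new("RGB", (750, 1000))  # PIL takes (width, height)

batch = processor.preprocess(image, return_tensors="np")
print(batch["pixel_values"].shape)  # (3888, 3, 14, 14) for the default pixel budget
print(batch["image_grid_thw"])      # [[ 1 72 54]] -> (grid_t, grid_h, grid_w)
```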
+     def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
+         """
+         A utility that returns the number of image patches for a given image size.
+
+         Args:
+             height (`int`):
+                 Height of the input image.
+             width (`int`):
+                 Width of the input image.
+             images_kwargs (`dict`, *optional*):
+                 Any kwargs to override defaults of the image processor.
+         Returns:
+             `int`: Number of image patches per image.
+         """
+         images_kwargs = images_kwargs if images_kwargs is not None else {}
+         min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
+         max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
+         patch_size = images_kwargs.get("patch_size", self.patch_size)
+         merge_size = images_kwargs.get("merge_size", self.merge_size)
+
+         factor = patch_size * merge_size
+         resized_height, resized_width = smart_resize(
+             height, width, factor, min_pixels=min_pixels, max_pixels=max_pixels
+         )
+         grid_h, grid_w = resized_height // patch_size, resized_width // patch_size
+         return grid_h * grid_w
+
+
+ __all__ = ["PaddleOCRVLImageProcessor"]
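`get_number_of_image_patches` returns the pre-merge patch count. In the Qwen2-VL-style convention this processor appears to follow, each `merge_size x merge_size` block of patches maps to one LLM vision token, so a consumer would divide by `merge_size**2`; that mapping is an assumption here, not stated in the diff. A short sketch under that assumption:

```python
from transformers.models.paddleocr_vl.image_processing_paddleocr_vl import (
    PaddleOCRVLImageProcessor,
)

processor = PaddleOCRVLImageProcessor()
n_patches = processor.get_number_of_image_patches(height=1000, width=750, images_kwargs={})
print(n_patches)                             # 3888 (= 72 * 54 for the default pixel budget)
# Assumption: one vision token per merge_size x merge_size patch block.
print(n_patches // processor.merge_size**2)  # 972
```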