transformers 5.0.0rc0-py3-none-any.whl → 5.0.0rc1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- transformers/__init__.py +30 -3
- transformers/cli/serve.py +47 -17
- transformers/conversion_mapping.py +15 -2
- transformers/convert_slow_tokenizer.py +225 -10
- transformers/core_model_loading.py +196 -135
- transformers/data/data_collator.py +12 -4
- transformers/dependency_versions_table.py +1 -2
- transformers/dynamic_module_utils.py +1 -2
- transformers/feature_extraction_utils.py +1 -2
- transformers/file_utils.py +0 -1
- transformers/generation/__init__.py +11 -1
- transformers/generation/configuration_utils.py +3 -2
- transformers/generation/continuous_batching/__init__.py +4 -0
- transformers/generation/continuous_batching/continuous_api.py +134 -79
- transformers/image_processing_base.py +1 -2
- transformers/integrations/__init__.py +4 -2
- transformers/integrations/accelerate.py +15 -3
- transformers/integrations/aqlm.py +38 -66
- transformers/integrations/awq.py +48 -514
- transformers/integrations/bitnet.py +45 -100
- transformers/integrations/bitsandbytes.py +79 -191
- transformers/integrations/deepspeed.py +1 -0
- transformers/integrations/eetq.py +84 -79
- transformers/integrations/fbgemm_fp8.py +191 -145
- transformers/integrations/finegrained_fp8.py +236 -193
- transformers/integrations/fp_quant.py +92 -0
- transformers/integrations/ggml.py +11 -1
- transformers/integrations/higgs.py +40 -62
- transformers/integrations/hub_kernels.py +42 -3
- transformers/integrations/integration_utils.py +10 -0
- transformers/integrations/mxfp4.py +25 -65
- transformers/integrations/peft.py +7 -29
- transformers/integrations/quanto.py +73 -55
- transformers/integrations/quark.py +55 -0
- transformers/integrations/spqr.py +44 -90
- transformers/integrations/torchao.py +32 -38
- transformers/integrations/vptq.py +42 -59
- transformers/modelcard.py +1 -2
- transformers/modeling_gguf_pytorch_utils.py +8 -0
- transformers/modeling_rope_utils.py +30 -6
- transformers/modeling_utils.py +116 -112
- transformers/models/__init__.py +3 -0
- transformers/models/afmoe/modeling_afmoe.py +4 -4
- transformers/models/albert/tokenization_albert.py +6 -12
- transformers/models/align/modeling_align.py +2 -0
- transformers/models/altclip/modeling_altclip.py +4 -0
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/modeling_aria.py +4 -4
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +1 -0
- transformers/models/audioflamingo3/modular_audioflamingo3.py +1 -0
- transformers/models/auto/configuration_auto.py +11 -0
- transformers/models/auto/feature_extraction_auto.py +2 -0
- transformers/models/auto/image_processing_auto.py +1 -0
- transformers/models/auto/modeling_auto.py +6 -0
- transformers/models/auto/processing_auto.py +18 -10
- transformers/models/auto/tokenization_auto.py +74 -472
- transformers/models/autoformer/modeling_autoformer.py +4 -0
- transformers/models/bamba/modeling_bamba.py +4 -3
- transformers/models/bark/modeling_bark.py +2 -0
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/barthez/tokenization_barthez.py +5 -10
- transformers/models/beit/modeling_beit.py +6 -1
- transformers/models/bert/tokenization_bert.py +8 -21
- transformers/models/big_bird/modeling_big_bird.py +6 -0
- transformers/models/big_bird/tokenization_big_bird.py +18 -42
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +8 -2
- transformers/models/biogpt/modeling_biogpt.py +2 -0
- transformers/models/biogpt/modular_biogpt.py +2 -0
- transformers/models/bit/modeling_bit.py +11 -2
- transformers/models/bitnet/modeling_bitnet.py +4 -4
- transformers/models/blenderbot/modeling_blenderbot.py +5 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +12 -16
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +5 -0
- transformers/models/blip/modeling_blip_text.py +2 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -1
- transformers/models/bloom/modeling_bloom.py +4 -0
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/bridgetower/modeling_bridgetower.py +5 -1
- transformers/models/bros/modeling_bros.py +4 -0
- transformers/models/camembert/tokenization_camembert.py +8 -12
- transformers/models/canine/modeling_canine.py +5 -0
- transformers/models/chameleon/modeling_chameleon.py +2 -1
- transformers/models/chinese_clip/modeling_chinese_clip.py +3 -0
- transformers/models/clap/modeling_clap.py +5 -0
- transformers/models/clip/tokenization_clip.py +22 -44
- transformers/models/clipseg/modeling_clipseg.py +5 -0
- transformers/models/clvp/modeling_clvp.py +5 -0
- transformers/models/clvp/tokenization_clvp.py +1 -63
- transformers/models/code_llama/tokenization_code_llama.py +20 -43
- transformers/models/codegen/tokenization_codegen.py +14 -43
- transformers/models/cohere/modeling_cohere.py +4 -3
- transformers/models/cohere/modular_cohere.py +2 -1
- transformers/models/cohere/tokenization_cohere.py +12 -42
- transformers/models/cohere2/modeling_cohere2.py +7 -6
- transformers/models/cohere2/modular_cohere2.py +5 -5
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +4 -3
- transformers/models/cohere2_vision/modular_cohere2_vision.py +4 -3
- transformers/models/colqwen2/modeling_colqwen2.py +1 -0
- transformers/models/colqwen2/modular_colqwen2.py +1 -0
- transformers/models/conditional_detr/modeling_conditional_detr.py +5 -0
- transformers/models/convbert/modeling_convbert.py +6 -0
- transformers/models/convnext/modeling_convnext.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +2 -4
- transformers/models/csm/modeling_csm.py +4 -3
- transformers/models/ctrl/modeling_ctrl.py +1 -0
- transformers/models/cvt/modeling_cvt.py +2 -0
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/modeling_d_fine.py +2 -0
- transformers/models/d_fine/modular_d_fine.py +1 -0
- transformers/models/dab_detr/modeling_dab_detr.py +4 -0
- transformers/models/dac/modeling_dac.py +2 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +5 -0
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -1
- transformers/models/dbrx/modeling_dbrx.py +2 -2
- transformers/models/deberta/modeling_deberta.py +5 -0
- transformers/models/deberta/tokenization_deberta.py +11 -20
- transformers/models/deberta_v2/modeling_deberta_v2.py +6 -0
- transformers/models/deberta_v2/tokenization_deberta_v2.py +13 -28
- transformers/models/decision_transformer/modeling_decision_transformer.py +4 -1
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +2 -3
- transformers/models/deepseek_v2/modular_deepseek_v2.py +2 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +3 -2
- transformers/models/deepseek_v3/modular_deepseek_v3.py +1 -0
- transformers/models/deformable_detr/modeling_deformable_detr.py +4 -0
- transformers/models/depth_anything/modeling_depth_anything.py +1 -0
- transformers/models/depth_pro/modeling_depth_pro.py +2 -0
- transformers/models/detr/modeling_detr.py +5 -0
- transformers/models/dia/modeling_dia.py +4 -3
- transformers/models/dia/modular_dia.py +0 -1
- transformers/models/diffllama/modeling_diffllama.py +2 -2
- transformers/models/dinat/modeling_dinat.py +3 -0
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +1 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +2 -2
- transformers/models/dinov3_vit/modular_dinov3_vit.py +2 -2
- transformers/models/distilbert/tokenization_distilbert.py +13 -0
- transformers/models/doge/modeling_doge.py +2 -3
- transformers/models/doge/modular_doge.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +2 -0
- transformers/models/dots1/modeling_dots1.py +10 -7
- transformers/models/dots1/modular_dots1.py +5 -3
- transformers/models/dpr/modeling_dpr.py +5 -0
- transformers/models/dpr/tokenization_dpr.py +12 -0
- transformers/models/edgetam/modeling_edgetam.py +1 -1
- transformers/models/edgetam_video/modeling_edgetam_video.py +1 -0
- transformers/models/edgetam_video/modular_edgetam_video.py +1 -0
- transformers/models/efficientloftr/modeling_efficientloftr.py +2 -2
- transformers/models/efficientnet/modeling_efficientnet.py +2 -0
- transformers/models/emu3/modeling_emu3.py +4 -4
- transformers/models/eomt/image_processing_eomt.py +13 -1
- transformers/models/eomt/image_processing_eomt_fast.py +14 -2
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5/modular_ernie4_5.py +2 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +5 -5
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +2 -2
- transformers/models/esm/modeling_esmfold.py +5 -4
- transformers/models/evolla/modeling_evolla.py +4 -4
- transformers/models/exaone4/modeling_exaone4.py +2 -2
- transformers/models/exaone4/modular_exaone4.py +0 -1
- transformers/models/falcon/modeling_falcon.py +6 -1
- transformers/models/falcon_h1/modeling_falcon_h1.py +4 -3
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +25 -35
- transformers/models/falcon_mamba/modular_falcon_mamba.py +12 -31
- transformers/{kernels/falcon_mamba → models/fast_vlm}/__init__.py +15 -3
- transformers/models/fast_vlm/configuration_fast_vlm.py +137 -0
- transformers/models/fast_vlm/modeling_fast_vlm.py +455 -0
- transformers/models/fast_vlm/modular_fast_vlm.py +273 -0
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +8 -3
- transformers/models/flaubert/modeling_flaubert.py +7 -0
- transformers/models/flava/modeling_flava.py +6 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -5
- transformers/models/florence2/modeling_florence2.py +2 -1
- transformers/models/florence2/modular_florence2.py +2 -1
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/focalnet/modeling_focalnet.py +4 -0
- transformers/models/fsmt/modeling_fsmt.py +2 -0
- transformers/models/funnel/modeling_funnel.py +8 -0
- transformers/models/funnel/tokenization_funnel.py +17 -24
- transformers/models/fuyu/processing_fuyu.py +3 -3
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/tokenization_gemma.py +10 -27
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +2 -1
- transformers/models/gemma3/modeling_gemma3.py +14 -84
- transformers/models/gemma3/modular_gemma3.py +12 -81
- transformers/models/gemma3n/modeling_gemma3n.py +18 -209
- transformers/models/gemma3n/modular_gemma3n.py +17 -59
- transformers/models/git/modeling_git.py +2 -0
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm4_moe/modeling_glm4_moe.py +5 -3
- transformers/models/glm4v/configuration_glm4v.py +3 -1
- transformers/models/glm4v/modeling_glm4v.py +3 -3
- transformers/models/glm4v/modular_glm4v.py +6 -4
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +3 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +6 -5
- transformers/models/glm4v_moe/modular_glm4v_moe.py +1 -1
- transformers/models/glpn/modeling_glpn.py +2 -0
- transformers/models/gpt2/modeling_gpt2.py +5 -1
- transformers/models/gpt2/tokenization_gpt2.py +16 -44
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +1 -0
- transformers/models/gpt_neo/modeling_gpt_neo.py +4 -0
- transformers/models/gpt_neox/modeling_gpt_neox.py +5 -2
- transformers/models/gpt_neox/modular_gpt_neox.py +3 -0
- transformers/models/gpt_neox/tokenization_gpt_neox.py +10 -49
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +3 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +5 -6
- transformers/models/gpt_oss/modular_gpt_oss.py +3 -5
- transformers/models/gptj/modeling_gptj.py +3 -0
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granitemoe/modeling_granitemoe.py +4 -6
- transformers/models/granitemoe/modular_granitemoe.py +0 -2
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +4 -6
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -6
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -0
- transformers/models/groupvit/modeling_groupvit.py +3 -0
- transformers/models/helium/modeling_helium.py +4 -3
- transformers/models/herbert/tokenization_herbert.py +9 -25
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -1
- transformers/models/hgnet_v2/modular_hgnet_v2.py +6 -1
- transformers/models/hiera/modeling_hiera.py +4 -0
- transformers/models/hubert/modeling_hubert.py +3 -0
- transformers/models/hubert/modular_hubert.py +1 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +4 -4
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +4 -4
- transformers/models/ibert/modeling_ibert.py +6 -0
- transformers/models/idefics/modeling_idefics.py +5 -21
- transformers/models/imagegpt/modeling_imagegpt.py +2 -1
- transformers/models/informer/modeling_informer.py +4 -0
- transformers/models/informer/modular_informer.py +1 -0
- transformers/models/internvl/modeling_internvl.py +2 -4
- transformers/models/internvl/modular_internvl.py +2 -4
- transformers/models/jamba/modeling_jamba.py +2 -2
- transformers/models/janus/modeling_janus.py +1 -0
- transformers/models/janus/modular_janus.py +1 -0
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/kosmos2/modeling_kosmos2.py +1 -0
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +3 -1
- transformers/models/lasr/__init__.py +29 -0
- transformers/models/lasr/configuration_lasr.py +244 -0
- transformers/models/lasr/feature_extraction_lasr.py +277 -0
- transformers/models/lasr/modeling_lasr.py +729 -0
- transformers/models/lasr/modular_lasr.py +569 -0
- transformers/models/lasr/processing_lasr.py +96 -0
- transformers/models/lasr/tokenization_lasr.py +186 -0
- transformers/models/layoutlm/modeling_layoutlm.py +5 -0
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +4 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +10 -53
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +4 -0
- transformers/models/layoutlmv3/tokenization_layoutlmv3.py +12 -61
- transformers/models/layoutxlm/tokenization_layoutxlm.py +13 -38
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +3 -0
- transformers/models/lfm2/modeling_lfm2.py +4 -5
- transformers/models/lfm2/modular_lfm2.py +0 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -5
- transformers/models/lightglue/modeling_lightglue.py +3 -1
- transformers/models/lightglue/modular_lightglue.py +1 -0
- transformers/models/lilt/modeling_lilt.py +4 -0
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama/tokenization_llama.py +15 -43
- transformers/models/llama4/modeling_llama4.py +3 -2
- transformers/models/longcat_flash/modeling_longcat_flash.py +4 -4
- transformers/models/longcat_flash/modular_longcat_flash.py +2 -2
- transformers/models/longformer/modeling_longformer.py +6 -0
- transformers/models/longt5/modeling_longt5.py +4 -0
- transformers/models/luke/modeling_luke.py +9 -0
- transformers/models/luke/tokenization_luke.py +11 -38
- transformers/models/lxmert/modeling_lxmert.py +2 -0
- transformers/models/m2m_100/modeling_m2m_100.py +4 -0
- transformers/models/mamba/modeling_mamba.py +14 -22
- transformers/models/marian/modeling_marian.py +5 -0
- transformers/models/markuplm/modeling_markuplm.py +4 -0
- transformers/models/markuplm/tokenization_markuplm.py +28 -61
- transformers/models/mask2former/modeling_mask2former.py +2 -0
- transformers/models/maskformer/modeling_maskformer.py +2 -0
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/mbart/tokenization_mbart.py +11 -52
- transformers/models/mbart50/tokenization_mbart50.py +7 -10
- transformers/models/megatron_bert/modeling_megatron_bert.py +7 -0
- transformers/models/mgp_str/modeling_mgp_str.py +2 -0
- transformers/models/mimi/modeling_mimi.py +3 -1
- transformers/models/minimax/modeling_minimax.py +4 -4
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral3/configuration_ministral3.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +4 -3
- transformers/models/mistral/modeling_mistral.py +4 -3
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mllama/modeling_mllama.py +2 -2
- transformers/models/mluke/tokenization_mluke.py +6 -6
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -0
- transformers/models/mobilenet_v1/modeling_mobilenet_v1.py +2 -0
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +3 -0
- transformers/models/mobilevit/modeling_mobilevit.py +3 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +3 -0
- transformers/models/modernbert/modeling_modernbert.py +4 -1
- transformers/models/modernbert/modular_modernbert.py +2 -0
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +8 -9
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +6 -7
- transformers/models/moonshine/modeling_moonshine.py +4 -2
- transformers/models/moshi/modeling_moshi.py +5 -2
- transformers/models/mpnet/modeling_mpnet.py +5 -0
- transformers/models/mpnet/tokenization_mpnet.py +5 -13
- transformers/models/mpt/modeling_mpt.py +2 -0
- transformers/models/mra/modeling_mra.py +6 -0
- transformers/models/mt5/modeling_mt5.py +7 -0
- transformers/models/musicgen/modeling_musicgen.py +2 -0
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +3 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nemotron/modeling_nemotron.py +4 -2
- transformers/models/nllb/tokenization_nllb.py +8 -22
- transformers/models/nougat/tokenization_nougat.py +11 -59
- transformers/models/nystromformer/modeling_nystromformer.py +6 -0
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +2 -2
- transformers/models/olmo2/modeling_olmo2.py +4 -5
- transformers/models/olmo2/modular_olmo2.py +0 -1
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +2 -0
- transformers/models/oneformer/modeling_oneformer.py +4 -1
- transformers/models/openai/modeling_openai.py +3 -0
- transformers/models/openai/tokenization_openai.py +10 -46
- transformers/models/opt/modeling_opt.py +2 -0
- transformers/models/owlv2/modeling_owlv2.py +4 -0
- transformers/models/owlvit/modeling_owlvit.py +4 -0
- transformers/models/paddleocr_vl/__init__.py +32 -0
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +336 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +503 -0
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl_fast.py +209 -0
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +1668 -0
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +1349 -0
- transformers/models/paddleocr_vl/processing_paddleocr_vl.py +135 -0
- transformers/models/parakeet/configuration_parakeet.py +4 -6
- transformers/models/parakeet/modeling_parakeet.py +9 -6
- transformers/models/parakeet/modular_parakeet.py +2 -2
- transformers/models/parakeet/processing_parakeet.py +1 -0
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +6 -0
- transformers/models/patchtst/modeling_patchtst.py +20 -2
- transformers/models/pegasus/modeling_pegasus.py +5 -0
- transformers/models/pegasus/tokenization_pegasus.py +17 -44
- transformers/models/pegasus_x/modeling_pegasus_x.py +4 -0
- transformers/models/perceiver/modeling_perceiver.py +8 -0
- transformers/models/persimmon/modeling_persimmon.py +2 -1
- transformers/models/phi/modeling_phi.py +4 -5
- transformers/models/phi/modular_phi.py +0 -1
- transformers/models/phi3/modeling_phi3.py +2 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +5 -5
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +4 -4
- transformers/models/phimoe/modeling_phimoe.py +4 -4
- transformers/models/phimoe/modular_phimoe.py +2 -2
- transformers/models/pix2struct/modeling_pix2struct.py +2 -0
- transformers/models/pixtral/modeling_pixtral.py +2 -1
- transformers/models/plbart/modeling_plbart.py +6 -0
- transformers/models/plbart/modular_plbart.py +2 -0
- transformers/models/plbart/tokenization_plbart.py +0 -2
- transformers/models/poolformer/modeling_poolformer.py +2 -0
- transformers/models/pop2piano/modeling_pop2piano.py +2 -0
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +1 -0
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +1 -0
- transformers/models/prophetnet/modeling_prophetnet.py +3 -0
- transformers/models/pvt/modeling_pvt.py +2 -0
- transformers/models/pvt_v2/modeling_pvt_v2.py +3 -0
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/tokenization_qwen2.py +14 -18
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +4 -2
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +13 -16
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +14 -16
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +1 -1
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +5 -6
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +3 -5
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -0
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +1 -1
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +6 -16
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -3
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +21 -23
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +14 -16
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +39 -37
- transformers/models/qwen3_vl/modular_qwen3_vl.py +37 -35
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +39 -37
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +4 -1
- transformers/models/rag/modeling_rag.py +1 -0
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +15 -1
- transformers/models/reformer/modeling_reformer.py +4 -0
- transformers/models/reformer/tokenization_reformer.py +11 -28
- transformers/models/regnet/modeling_regnet.py +6 -1
- transformers/models/rembert/modeling_rembert.py +6 -0
- transformers/models/rembert/tokenization_rembert.py +3 -10
- transformers/models/resnet/modeling_resnet.py +11 -2
- transformers/models/roberta/tokenization_roberta.py +18 -27
- transformers/models/roformer/modeling_roformer.py +6 -0
- transformers/models/roformer/tokenization_roformer.py +77 -412
- transformers/models/rt_detr/modeling_rt_detr.py +2 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +5 -1
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +2 -0
- transformers/models/rwkv/modeling_rwkv.py +1 -0
- transformers/models/sam2/modeling_sam2.py +2 -2
- transformers/models/sam2/modular_sam2.py +2 -2
- transformers/models/sam2_video/modeling_sam2_video.py +1 -0
- transformers/models/sam2_video/modular_sam2_video.py +1 -0
- transformers/models/sam3/modeling_sam3.py +77 -80
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker/modular_sam3_tracker.py +6 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +1 -0
- transformers/models/sam3_video/modeling_sam3_video.py +1 -0
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +5 -1
- transformers/models/seamless_m4t/tokenization_seamless_m4t.py +27 -59
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +5 -1
- transformers/models/seed_oss/modeling_seed_oss.py +2 -2
- transformers/models/segformer/modeling_segformer.py +4 -1
- transformers/models/seggpt/modeling_seggpt.py +2 -0
- transformers/models/sew/modeling_sew.py +3 -0
- transformers/models/sew/modular_sew.py +1 -0
- transformers/models/sew_d/modeling_sew_d.py +3 -0
- transformers/models/siglip2/modeling_siglip2.py +4 -0
- transformers/models/siglip2/modular_siglip2.py +4 -0
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smolvlm/processing_smolvlm.py +0 -7
- transformers/models/speech_to_text/modeling_speech_to_text.py +4 -0
- transformers/models/speecht5/modeling_speecht5.py +13 -1
- transformers/models/splinter/modeling_splinter.py +3 -0
- transformers/models/splinter/tokenization_splinter.py +9 -28
- transformers/models/squeezebert/modeling_squeezebert.py +6 -0
- transformers/models/stablelm/modeling_stablelm.py +3 -1
- transformers/models/starcoder2/modeling_starcoder2.py +4 -3
- transformers/models/superglue/modeling_superglue.py +1 -0
- transformers/models/superpoint/modeling_superpoint.py +1 -0
- transformers/models/swiftformer/modeling_swiftformer.py +2 -0
- transformers/models/swin/modeling_swin.py +4 -0
- transformers/models/swin2sr/modeling_swin2sr.py +2 -0
- transformers/models/swinv2/modeling_swinv2.py +4 -0
- transformers/models/t5/modeling_t5.py +7 -0
- transformers/models/t5/tokenization_t5.py +4 -8
- transformers/models/t5gemma/modeling_t5gemma.py +5 -5
- transformers/models/t5gemma2/modeling_t5gemma2.py +6 -6
- transformers/models/table_transformer/modeling_table_transformer.py +4 -0
- transformers/models/tapas/modeling_tapas.py +3 -0
- transformers/models/textnet/modeling_textnet.py +11 -2
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +4 -0
- transformers/models/timesfm/modeling_timesfm.py +2 -0
- transformers/models/timesfm/modular_timesfm.py +2 -0
- transformers/models/timesformer/modeling_timesformer.py +2 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +1 -1
- transformers/models/trocr/modeling_trocr.py +2 -0
- transformers/models/tvp/modeling_tvp.py +2 -0
- transformers/models/udop/modeling_udop.py +4 -0
- transformers/models/udop/tokenization_udop.py +5 -13
- transformers/models/umt5/modeling_umt5.py +7 -0
- transformers/models/unispeech/modeling_unispeech.py +4 -0
- transformers/models/unispeech/modular_unispeech.py +2 -0
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +6 -0
- transformers/models/unispeech_sat/modular_unispeech_sat.py +2 -0
- transformers/models/univnet/modeling_univnet.py +1 -0
- transformers/models/upernet/modeling_upernet.py +1 -0
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/vilt/modeling_vilt.py +6 -0
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +1 -0
- transformers/models/visual_bert/modeling_visual_bert.py +6 -0
- transformers/models/vitdet/modeling_vitdet.py +2 -0
- transformers/models/vitmatte/modeling_vitmatte.py +1 -0
- transformers/models/vits/modeling_vits.py +1 -0
- transformers/models/vjepa2/modeling_vjepa2.py +1 -0
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -0
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +5 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +6 -0
- transformers/models/wavlm/modeling_wavlm.py +5 -0
- transformers/models/whisper/modeling_whisper.py +6 -0
- transformers/models/whisper/tokenization_whisper.py +4 -15
- transformers/models/x_clip/modeling_x_clip.py +3 -0
- transformers/models/xglm/modeling_xglm.py +1 -0
- transformers/models/xglm/tokenization_xglm.py +4 -9
- transformers/models/xlm/modeling_xlm.py +5 -0
- transformers/models/xlm_roberta/tokenization_xlm_roberta.py +9 -16
- transformers/models/xlnet/tokenization_xlnet.py +3 -7
- transformers/models/yoso/modeling_yoso.py +6 -0
- transformers/models/zamba/modeling_zamba.py +2 -0
- transformers/models/zamba2/modeling_zamba2.py +4 -2
- transformers/models/zamba2/modular_zamba2.py +1 -1
- transformers/models/zoedepth/modeling_zoedepth.py +1 -0
- transformers/pipelines/__init__.py +2 -3
- transformers/pipelines/base.py +1 -9
- transformers/pipelines/document_question_answering.py +3 -1
- transformers/pipelines/text_generation.py +1 -1
- transformers/processing_utils.py +23 -11
- transformers/quantizers/base.py +35 -110
- transformers/quantizers/quantizer_aqlm.py +1 -5
- transformers/quantizers/quantizer_auto_round.py +1 -2
- transformers/quantizers/quantizer_awq.py +17 -81
- transformers/quantizers/quantizer_bitnet.py +3 -8
- transformers/quantizers/quantizer_bnb_4bit.py +13 -110
- transformers/quantizers/quantizer_bnb_8bit.py +16 -92
- transformers/quantizers/quantizer_compressed_tensors.py +1 -5
- transformers/quantizers/quantizer_eetq.py +14 -62
- transformers/quantizers/quantizer_fbgemm_fp8.py +34 -125
- transformers/quantizers/quantizer_finegrained_fp8.py +13 -105
- transformers/quantizers/quantizer_fp_quant.py +48 -78
- transformers/quantizers/quantizer_gptq.py +7 -24
- transformers/quantizers/quantizer_higgs.py +40 -54
- transformers/quantizers/quantizer_hqq.py +144 -153
- transformers/quantizers/quantizer_mxfp4.py +13 -167
- transformers/quantizers/quantizer_quanto.py +20 -64
- transformers/quantizers/quantizer_quark.py +36 -17
- transformers/quantizers/quantizer_spqr.py +1 -4
- transformers/quantizers/quantizer_torchao.py +23 -202
- transformers/quantizers/quantizer_vptq.py +8 -22
- transformers/quantizers/quantizers_utils.py +20 -0
- transformers/testing_utils.py +297 -36
- transformers/tokenization_mistral_common.py +4 -0
- transformers/tokenization_utils_base.py +113 -222
- transformers/tokenization_utils_tokenizers.py +168 -107
- transformers/trainer.py +28 -31
- transformers/trainer_jit_checkpoint.py +126 -0
- transformers/trainer_utils.py +1 -1
- transformers/training_args.py +66 -28
- transformers/utils/__init__.py +3 -4
- transformers/utils/auto_docstring.py +1 -0
- transformers/utils/generic.py +27 -1
- transformers/utils/hub.py +5 -15
- transformers/utils/import_utils.py +61 -16
- transformers/utils/kernel_config.py +4 -2
- transformers/utils/loading_report.py +19 -10
- transformers/utils/quantization_config.py +75 -242
- transformers/video_processing_utils.py +1 -2
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/METADATA +274 -227
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/RECORD +536 -520
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/WHEEL +1 -1
- transformers/kernels/__init__.py +0 -0
- transformers/kernels/falcon_mamba/selective_scan_with_ln_interface.py +0 -529
- transformers/models/roformer/tokenization_roformer_fast.py +0 -160
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info/licenses}/LICENSE +0 -0
- {transformers-5.0.0rc0.dist-info → transformers-5.0.0rc1.dist-info}/top_level.txt +0 -0
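
The largest additions in this release are the new PaddleOCR-VL files listed above; the `modular_paddleocr_vl.py` hunk reproduced below defines the slow and fast image processors plus the processor class. For orientation, here is a small standalone sketch of the resize and image-token arithmetic those classes implement (the 1080x1920 input size is an illustrative assumption; `patch_size=14`, `merge_size=2`, and the pixel bounds are the defaults shown in the diff):

    # Sketch of the PaddleOCR-VL sizing math: each side snaps to a multiple of
    # patch_size * merge_size, the total pixel count is kept within
    # [min_pixels, max_pixels], and every merge_size x merge_size block of
    # vision patches becomes one image token for the language model.
    patch_size, merge_size = 14, 2
    factor = patch_size * merge_size                    # 28
    min_pixels, max_pixels = 384 * 384, 1536 * 1536

    height, width = 1080, 1920                          # hypothetical input image
    h_bar = round(height / factor) * factor             # 1092
    w_bar = round(width / factor) * factor              # 1932
    assert min_pixels <= h_bar * w_bar <= max_pixels    # 2_109_744 pixels, so no extra scaling

    grid_t = 1                                          # temporal_patch_size defaults to 1
    grid_h, grid_w = h_bar // patch_size, w_bar // patch_size            # 78, 138
    num_image_tokens = (grid_t * grid_h * grid_w) // (merge_size ** 2)   # 2691
    print((h_bar, w_bar), num_image_tokens)

The same count is what `PaddleOCRVLProcessor.__call__` produces when it expands each image token into `image_grid_thw.prod() // merge_size // merge_size` placeholder tokens.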
|
@@ -0,0 +1,1349 @@
|
|
|
1
|
+
# Copyright 2025 The PaddlePaddle Team and The HuggingFace Inc. team. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
|
|
4
|
+
# and OPT implementations in this library. It has been modified from its
|
|
5
|
+
# original forms to accommodate minor architectural differences compared
|
|
6
|
+
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
|
|
7
|
+
#
|
|
8
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
9
|
+
# you may not use this file except in compliance with the License.
|
|
10
|
+
# You may obtain a copy of the License at
|
|
11
|
+
#
|
|
12
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
13
|
+
#
|
|
14
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
15
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
16
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
17
|
+
# See the License for the specific language governing permissions and
|
|
18
|
+
# limitations under the License.
|
|
19
|
+
|
|
20
|
+
import math
|
|
21
|
+
from typing import Optional, Union
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
import torch
|
|
25
|
+
import torch.nn.functional as F
|
|
26
|
+
from torch import nn
|
|
27
|
+
|
|
28
|
+
from ...activations import GELUActivation
|
|
29
|
+
from ...cache_utils import Cache, DynamicCache
|
|
30
|
+
from ...image_processing_utils import BatchFeature
|
|
31
|
+
from ...image_processing_utils_fast import BaseImageProcessorFast, group_images_by_shape, reorder_images
|
|
32
|
+
from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
|
|
33
|
+
from ...image_utils import (
|
|
34
|
+
OPENAI_CLIP_MEAN,
|
|
35
|
+
OPENAI_CLIP_STD,
|
|
36
|
+
ChannelDimension,
|
|
37
|
+
ImageInput,
|
|
38
|
+
PILImageResampling,
|
|
39
|
+
SizeDict,
|
|
40
|
+
get_image_size,
|
|
41
|
+
infer_channel_dimension_format,
|
|
42
|
+
is_scaled_image,
|
|
43
|
+
make_list_of_images,
|
|
44
|
+
to_numpy_array,
|
|
45
|
+
)
|
|
46
|
+
from ...masking_utils import create_bidirectional_mask, create_causal_mask
|
|
47
|
+
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
|
|
48
|
+
from ...modeling_utils import PreTrainedModel
|
|
49
|
+
from ...models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
|
|
50
|
+
from ...processing_utils import (
|
|
51
|
+
ProcessingKwargs,
|
|
52
|
+
ProcessorMixin,
|
|
53
|
+
Unpack,
|
|
54
|
+
)
|
|
55
|
+
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
|
56
|
+
from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
|
|
57
|
+
from ...utils.generic import check_model_inputs
|
|
58
|
+
from ..ernie4_5.configuration_ernie4_5 import Ernie4_5Config
|
|
59
|
+
from ..ernie4_5.modeling_ernie4_5 import (
|
|
60
|
+
Ernie4_5DecoderLayer,
|
|
61
|
+
Ernie4_5MLP,
|
|
62
|
+
Ernie4_5Model,
|
|
63
|
+
Ernie4_5RMSNorm,
|
|
64
|
+
)
|
|
65
|
+
from ..qwen2_5_omni.modeling_qwen2_5_omni import (
|
|
66
|
+
Qwen2_5OmniAttention,
|
|
67
|
+
)
|
|
68
|
+
from ..qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig
|
|
69
|
+
from ..qwen2_vl.modeling_qwen2_vl import (
|
|
70
|
+
Qwen2VLCausalLMOutputWithPast,
|
|
71
|
+
Qwen2VLForConditionalGeneration,
|
|
72
|
+
Qwen2VLModel,
|
|
73
|
+
Qwen2VLModelOutputWithPast,
|
|
74
|
+
Qwen2VLRotaryEmbedding,
|
|
75
|
+
VisionRotaryEmbedding,
|
|
76
|
+
)
|
|
77
|
+
from ..siglip.configuration_siglip import SiglipVisionConfig
|
|
78
|
+
from ..siglip.modeling_siglip import (
|
|
79
|
+
SiglipMLP,
|
|
80
|
+
SiglipVisionEmbeddings,
|
|
81
|
+
)
|
|
82
|
+
from ..video_llama_3.modeling_video_llama_3 import (
|
|
83
|
+
VideoLlama3VisionAttention,
|
|
84
|
+
VideoLlama3VisionEncoder,
|
|
85
|
+
VideoLlama3VisionEncoderLayer,
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
logger = logging.get_logger(__name__)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def smart_resize(
|
|
93
|
+
height: int,
|
|
94
|
+
width: int,
|
|
95
|
+
factor: int = 28,
|
|
96
|
+
min_pixels: int = 384 * 384,
|
|
97
|
+
max_pixels: int = 1536 * 1536,
|
|
98
|
+
):
|
|
99
|
+
if height < factor:
|
|
100
|
+
width = round((width * factor) / height)
|
|
101
|
+
height = factor
|
|
102
|
+
|
|
103
|
+
if width < factor:
|
|
104
|
+
height = round((height * factor) / width)
|
|
105
|
+
width = factor
|
|
106
|
+
|
|
107
|
+
if max(height, width) / min(height, width) > 200:
|
|
108
|
+
raise ValueError(
|
|
109
|
+
f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
|
|
110
|
+
)
|
|
111
|
+
h_bar = round(height / factor) * factor
|
|
112
|
+
w_bar = round(width / factor) * factor
|
|
113
|
+
if h_bar * w_bar > max_pixels:
|
|
114
|
+
beta = math.sqrt((height * width) / max_pixels)
|
|
115
|
+
h_bar = math.floor(height / beta / factor) * factor
|
|
116
|
+
w_bar = math.floor(width / beta / factor) * factor
|
|
117
|
+
elif h_bar * w_bar < min_pixels:
|
|
118
|
+
beta = math.sqrt(min_pixels / (height * width))
|
|
119
|
+
h_bar = math.ceil(height * beta / factor) * factor
|
|
120
|
+
w_bar = math.ceil(width * beta / factor) * factor
|
|
121
|
+
return h_bar, w_bar
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class PaddleOCRVLImageProcessor(Qwen2VLImageProcessor):
|
|
125
|
+
r"""
|
|
126
|
+
Constructs a PaddleOCRVL image processor that dynamically resizes images based on the original images.
|
|
127
|
+
|
|
128
|
+
Args:
|
|
129
|
+
do_resize (`bool`, *optional*, defaults to `True`):
|
|
130
|
+
Whether to resize the image's (height, width) dimensions.
|
|
131
|
+
size (`dict[str, int]`, *optional*):
|
|
132
|
+
Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
|
|
133
|
+
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
|
134
|
+
Resampling filter to use when resizing the image.
|
|
135
|
+
do_rescale (`bool`, *optional*, defaults to `True`):
|
|
136
|
+
Whether to rescale the image by the specified scale `rescale_factor`.
|
|
137
|
+
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
|
138
|
+
Scale factor to use if rescaling the image.
|
|
139
|
+
do_normalize (`bool`, *optional*, defaults to `True`):
|
|
140
|
+
Whether to normalize the image.
|
|
141
|
+
image_mean (`float` or `list[float]`, *optional*):
|
|
142
|
+
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
|
143
|
+
image_std (`float` or `list[float]`, *optional*):
|
|
144
|
+
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
|
145
|
+
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
|
146
|
+
Whether to convert the image to RGB.
|
|
147
|
+
min_pixels (`int`, *optional*, defaults to `384 * 384`):
|
|
148
|
+
The min pixels of the image to resize the image.
|
|
149
|
+
max_pixels (`int`, *optional*, defaults to `1536 * 1536`):
|
|
150
|
+
The max pixels of the image to resize the image.
|
|
151
|
+
patch_size (`int`, *optional*, defaults to 14):
|
|
152
|
+
The spatial patch size of the vision encoder.
|
|
153
|
+
temporal_patch_size (`int`, *optional*, defaults to 1):
|
|
154
|
+
The temporal patch size of the vision encoder.
|
|
155
|
+
merge_size (`int`, *optional*, defaults to 2):
|
|
156
|
+
The merge size of the vision encoder to llm encoder.
|
|
157
|
+
"""
|
|
158
|
+
|
|
159
|
+
model_input_names = [
|
|
160
|
+
"pixel_values",
|
|
161
|
+
"image_grid_thw",
|
|
162
|
+
]
|
|
163
|
+
|
|
164
|
+
def __init__(
|
|
165
|
+
self,
|
|
166
|
+
do_resize: bool = True,
|
|
167
|
+
size: Optional[dict[str, int]] = None,
|
|
168
|
+
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
|
169
|
+
do_rescale: bool = True,
|
|
170
|
+
rescale_factor: Union[int, float] = 1 / 255,
|
|
171
|
+
do_normalize: bool = True,
|
|
172
|
+
image_mean: Optional[Union[float, list[float]]] = None,
|
|
173
|
+
image_std: Optional[Union[float, list[float]]] = None,
|
|
174
|
+
do_convert_rgb: bool = True,
|
|
175
|
+
min_pixels: int = 384 * 384,
|
|
176
|
+
max_pixels: int = 1536 * 1536,
|
|
177
|
+
patch_size: int = 14,
|
|
178
|
+
temporal_patch_size: int = 1,
|
|
179
|
+
merge_size: int = 2,
|
|
180
|
+
**kwargs,
|
|
181
|
+
) -> None:
|
|
182
|
+
super().__init__()
|
|
183
|
+
|
|
184
|
+
def _preprocess(
|
|
185
|
+
self,
|
|
186
|
+
images: ImageInput,
|
|
187
|
+
do_resize: Optional[bool] = None,
|
|
188
|
+
size: Optional[dict[str, int]] = None,
|
|
189
|
+
resample: PILImageResampling = None,
|
|
190
|
+
do_rescale: Optional[bool] = None,
|
|
191
|
+
rescale_factor: Optional[float] = None,
|
|
192
|
+
do_normalize: Optional[bool] = None,
|
|
193
|
+
image_mean: Optional[Union[float, list[float]]] = None,
|
|
194
|
+
image_std: Optional[Union[float, list[float]]] = None,
|
|
195
|
+
patch_size: Optional[int] = None,
|
|
196
|
+
temporal_patch_size: Optional[int] = None,
|
|
197
|
+
merge_size: Optional[int] = None,
|
|
198
|
+
do_convert_rgb: Optional[bool] = None,
|
|
199
|
+
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
|
200
|
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
|
201
|
+
):
|
|
202
|
+
"""
|
|
203
|
+
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
|
204
|
+
Args:
|
|
205
|
+
images (`ImageInput`):
|
|
206
|
+
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
|
|
207
|
+
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
|
208
|
+
Whether to resize the image.
|
|
209
|
+
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
|
210
|
+
Size of the image after resizing. `shortest_edge` and `longest_edge` keys must be present.
|
|
211
|
+
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
|
|
212
|
+
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
|
|
213
|
+
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
|
214
|
+
Whether to rescale the image.
|
|
215
|
+
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
|
216
|
+
Scale factor to use if rescaling the image.
|
|
217
|
+
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
|
218
|
+
Whether to normalize the image.
|
|
219
|
+
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
|
220
|
+
Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
|
221
|
+
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
|
222
|
+
Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
|
|
223
|
+
patch_size (`int`, *optional*, defaults to `self.patch_size`):
|
|
224
|
+
The spatial patch size of the vision encoder.
|
|
225
|
+
temporal_patch_size (`int`, *optional*, defaults to `self.temporal_patch_size`):
|
|
226
|
+
The temporal patch size of the vision encoder.
|
|
227
|
+
merge_size (`int`, *optional*, defaults to `self.merge_size`):
|
|
228
|
+
The merge size of the vision encoder to llm encoder.
|
|
229
|
+
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
|
230
|
+
Whether to convert the image to RGB.
|
|
231
|
+
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
|
|
232
|
+
The channel dimension format for the output image. Can be one of:
|
|
233
|
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
|
234
|
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
|
235
|
+
- Unset: Use the channel dimension format of the input image.
|
|
236
|
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
|
237
|
+
The channel dimension format for the input image. Can be one of:
|
|
238
|
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
|
239
|
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
|
240
|
+
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
|
241
|
+
"""
|
|
242
|
+
images = make_list_of_images(images)
|
|
243
|
+
images = self.fetch_images(images)
|
|
244
|
+
|
|
245
|
+
if do_convert_rgb:
|
|
246
|
+
images = [convert_to_rgb(image) for image in images]
|
|
247
|
+
|
|
248
|
+
# All transformations expect numpy arrays.
|
|
249
|
+
images = [to_numpy_array(image) for image in images]
|
|
250
|
+
|
|
251
|
+
if is_scaled_image(images[0]) and do_rescale:
|
|
252
|
+
logger.warning_once(
|
|
253
|
+
"It looks like you are trying to rescale already rescaled images. If the input"
|
|
254
|
+
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
|
255
|
+
)
|
|
256
|
+
if input_data_format is None:
|
|
257
|
+
# We assume that all images have the same channel dimension format.
|
|
258
|
+
input_data_format = infer_channel_dimension_format(images[0])
|
|
259
|
+
|
|
260
|
+
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
|
261
|
+
resized_height, resized_width = height, width
|
|
262
|
+
processed_images = []
|
|
263
|
+
|
|
264
|
+
for image in images:
|
|
265
|
+
if do_resize:
|
|
266
|
+
resized_height, resized_width = smart_resize(
|
|
267
|
+
height,
|
|
268
|
+
width,
|
|
269
|
+
factor=patch_size * merge_size,
|
|
270
|
+
min_pixels=size["shortest_edge"],
|
|
271
|
+
max_pixels=size["longest_edge"],
|
|
272
|
+
)
|
|
273
|
+
image = resize(
|
|
274
|
+
image,
|
|
275
|
+
size=(resized_height, resized_width),
|
|
276
|
+
resample=resample,
|
|
277
|
+
input_data_format=input_data_format,
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
if do_rescale:
|
|
281
|
+
image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
|
|
282
|
+
|
|
283
|
+
if do_normalize:
|
|
284
|
+
image = self.normalize(
|
|
285
|
+
image=image,
|
|
286
|
+
mean=image_mean,
|
|
287
|
+
std=image_std,
|
|
288
|
+
input_data_format=input_data_format,
|
|
289
|
+
)
|
|
290
|
+
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
|
|
291
|
+
processed_images.append(image)
|
|
292
|
+
|
|
293
|
+
patches = np.array(processed_images)
|
|
294
|
+
if data_format == ChannelDimension.LAST:
|
|
295
|
+
patches = patches.transpose(0, 3, 1, 2)
|
|
296
|
+
if patches.shape[0] == 1:
|
|
297
|
+
patches = np.tile(patches, (temporal_patch_size, 1, 1, 1))
|
|
298
|
+
|
|
299
|
+
channel = patches.shape[1]
|
|
300
|
+
grid_t = patches.shape[0] // temporal_patch_size
|
|
301
|
+
grid_h, grid_w = (
|
|
302
|
+
resized_height // patch_size,
|
|
303
|
+
resized_width // patch_size,
|
|
304
|
+
)
|
|
305
|
+
patches = patches.reshape(
|
|
306
|
+
grid_t,
|
|
307
|
+
temporal_patch_size,
|
|
308
|
+
channel,
|
|
309
|
+
grid_h,
|
|
310
|
+
patch_size,
|
|
311
|
+
grid_w,
|
|
312
|
+
patch_size,
|
|
313
|
+
)
|
|
314
|
+
patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
|
|
315
|
+
if temporal_patch_size != 1:
|
|
316
|
+
raise ValueError(f"temporal_patch_size must be 1!, but got {temporal_patch_size}!")
|
|
317
|
+
flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, patch_size, patch_size)
|
|
318
|
+
return flatten_patches, (grid_t, grid_h, grid_w)
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
class PaddleOCRVLImageProcessorFast(BaseImageProcessorFast):
|
|
322
|
+
def __init__(
|
|
323
|
+
self,
|
|
324
|
+
do_resize: bool = True,
|
|
325
|
+
size: Optional[dict[str, int]] = None,
|
|
326
|
+
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
|
327
|
+
do_rescale: bool = True,
|
|
328
|
+
rescale_factor: Union[int, float] = 1 / 255,
|
|
329
|
+
do_normalize: bool = True,
|
|
330
|
+
image_mean: Optional[Union[float, list[float]]] = None,
|
|
331
|
+
image_std: Optional[Union[float, list[float]]] = None,
|
|
332
|
+
do_convert_rgb: bool = True,
|
|
333
|
+
min_pixels: int = 384 * 384,
|
|
334
|
+
max_pixels: int = 1536 * 1536,
|
|
335
|
+
patch_size: int = 14,
|
|
336
|
+
temporal_patch_size: int = 1,
|
|
337
|
+
merge_size: int = 2,
|
|
338
|
+
**kwargs,
|
|
339
|
+
) -> None:
|
|
340
|
+
super().__init__(**kwargs)
|
|
341
|
+
if size is not None and ("shortest_edge" not in size or "longest_edge" not in size):
|
|
342
|
+
raise ValueError("size must contain 'shortest_edge' and 'longest_edge' keys.")
|
|
343
|
+
else:
|
|
344
|
+
size = {"shortest_edge": 384 * 384, "longest_edge": 1536 * 1536}
|
|
345
|
+
# backward compatibility: override size with min_pixels and max_pixels if they are provided
|
|
346
|
+
if min_pixels is not None:
|
|
347
|
+
size["shortest_edge"] = min_pixels
|
|
348
|
+
if max_pixels is not None:
|
|
349
|
+
size["longest_edge"] = max_pixels
|
|
350
|
+
self.min_pixels = size["shortest_edge"]
|
|
351
|
+
self.max_pixels = size["longest_edge"]
|
|
352
|
+
self.size = size
|
|
353
|
+
|
|
354
|
+
self.do_resize = do_resize
|
|
355
|
+
self.resample = resample
|
|
356
|
+
self.do_rescale = do_rescale
|
|
357
|
+
self.rescale_factor = rescale_factor
|
|
358
|
+
self.do_normalize = do_normalize
|
|
359
|
+
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
|
|
360
|
+
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
|
|
361
|
+
|
|
362
|
+
self.patch_size = patch_size
|
|
363
|
+
self.temporal_patch_size = temporal_patch_size
|
|
364
|
+
self.merge_size = merge_size
|
|
365
|
+
self.do_convert_rgb = do_convert_rgb
|
|
366
|
+
|
|
367
|
+
def _preprocess(
|
|
368
|
+
self,
|
|
369
|
+
images: list["torch.Tensor"],
|
|
370
|
+
do_resize: bool,
|
|
371
|
+
size: SizeDict,
|
|
372
|
+
interpolation: Optional["F.InterpolationMode"],
|
|
373
|
+
do_rescale: bool,
|
|
374
|
+
rescale_factor: float,
|
|
375
|
+
do_normalize: bool,
|
|
376
|
+
image_mean: Optional[Union[float, list[float]]],
|
|
377
|
+
image_std: Optional[Union[float, list[float]]],
|
|
378
|
+
disable_grouping: Optional[bool],
|
|
379
|
+
return_tensors: Optional[Union[str, TensorType]],
|
|
380
|
+
patch_size: Optional[int] = None,
|
|
381
|
+
temporal_patch_size: Optional[int] = None,
|
|
382
|
+
merge_size: Optional[int] = None,
|
|
383
|
+
**kwargs,
|
|
384
|
+
):
|
|
385
|
+
patch_size = patch_size if patch_size is not None else self.patch_size
|
|
386
|
+
temporal_patch_size = temporal_patch_size if temporal_patch_size is not None else self.temporal_patch_size
|
|
387
|
+
merge_size = merge_size if merge_size is not None else self.merge_size
|
|
388
|
+
|
|
389
|
+
grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
|
|
390
|
+
resized_images_grouped = {}
|
|
391
|
+
for shape, stacked_images in grouped_images.items():
|
|
392
|
+
height, width = stacked_images.shape[-2:]
|
|
393
|
+
if do_resize:
|
|
394
|
+
resized_height, resized_width = smart_resize(
|
|
395
|
+
height,
|
|
396
|
+
width,
|
|
397
|
+
factor=patch_size * merge_size,
|
|
398
|
+
min_pixels=size["shortest_edge"],
|
|
399
|
+
max_pixels=size["longest_edge"],
|
|
400
|
+
)
|
|
401
|
+
stacked_images = self.resize(
|
|
402
|
+
image=stacked_images,
|
|
403
|
+
size=SizeDict(height=resized_height, width=resized_width),
|
|
404
|
+
interpolation=interpolation,
|
|
405
|
+
)
|
|
406
|
+
resized_images_grouped[shape] = stacked_images
|
|
407
|
+
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
|
|
408
|
+
|
|
409
|
+
# Group images by size for further processing
|
|
410
|
+
# Needed in case do_resize is False, or resize returns images with different sizes
|
|
411
|
+
grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
|
|
412
|
+
processed_images_grouped = {}
|
|
413
|
+
processed_grids = {}
|
|
414
|
+
for shape, stacked_images in grouped_images.items():
|
|
415
|
+
resized_height, resized_width = stacked_images.shape[-2:]
|
|
416
|
+
# Fused rescale and normalize
|
|
417
|
+
patches = self.rescale_and_normalize(
|
|
418
|
+
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
|
|
419
|
+
)
|
|
420
|
+
|
|
421
|
+
if patches.ndim == 4:
|
|
422
|
+
# add a temporal dimension if we have images
|
|
423
|
+
patches = patches.unsqueeze(1)
|
|
424
|
+
if patches.shape[1] % temporal_patch_size != 0:
|
|
425
|
+
repeats = patches[:, -1:].repeat(1, temporal_patch_size - 1, 1, 1, 1)
|
|
426
|
+
patches = torch.cat([patches, repeats], dim=1)
|
|
427
|
+
|
|
428
|
+
batch_size, grid_t, channel = patches.shape[:3]
|
|
429
|
+
grid_t = grid_t // temporal_patch_size
|
|
430
|
+
grid_h, grid_w = (
|
|
431
|
+
resized_height // patch_size,
|
|
432
|
+
resized_width // patch_size,
|
|
433
|
+
)
|
|
434
|
+
patches = patches.view(
|
|
435
|
+
batch_size,
|
|
436
|
+
grid_t,
|
|
437
|
+
temporal_patch_size,
|
|
438
|
+
channel,
|
|
439
|
+
grid_h,
|
|
440
|
+
patch_size,
|
|
441
|
+
grid_w,
|
|
442
|
+
patch_size,
|
|
443
|
+
)
|
|
444
|
+
patches = patches.permute(0, 1, 4, 6, 3, 2, 5, 7)
|
|
445
|
+
flatten_patches = patches.reshape(batch_size, grid_t * grid_h * grid_w, channel, patch_size, patch_size)
|
|
446
|
+
|
|
447
|
+
processed_images_grouped[shape] = flatten_patches
|
|
448
|
+
processed_grids[shape] = [[grid_t, grid_h, grid_w]] * batch_size
|
|
449
|
+
|
|
450
|
+
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
|
|
451
|
+
processed_grids = reorder_images(processed_grids, grouped_images_index)
|
|
452
|
+
pixel_values = torch.cat(processed_images, dim=0)
|
|
453
|
+
image_grid_thw = torch.tensor(processed_grids)
|
|
454
|
+
|
|
455
|
+
return BatchFeature(
|
|
456
|
+
data={"pixel_values": pixel_values, "image_grid_thw": image_grid_thw}, tensor_type=return_tensors
|
|
457
|
+
)
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
class PaddleOCRVLProcessorKwargs(ProcessingKwargs, total=False):
|
|
461
|
+
_defaults = {
|
|
462
|
+
"text_kwargs": {
|
|
463
|
+
"padding": False,
|
|
464
|
+
},
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
class PaddleOCRVLProcessor(ProcessorMixin):
    r"""
    [`PaddleOCRVLProcessor`] offers all the functionalities of [`PaddleOCRVLImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~PaddleOCRVLProcessor.__call__`] and [`~PaddleOCRVLProcessor.decode`] for more information.

    Args:
        image_processor ([`PaddleOCRVLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        self.image_token = tokenizer.image_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        **kwargs: Unpack[PaddleOCRVLProcessorKwargs],
    ) -> BatchFeature:
        """
        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            PaddleOCRVLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]

        else:
            image_inputs = {}
            image_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        text = text.copy()

        if image_grid_thw is not None:
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    text[i] = text[i].replace(
                        self.image_token,
                        "<|placeholder|>"
                        * (
                            image_grid_thw[index].prod()
                            // self.image_processor.merge_size
                            // self.image_processor.merge_size
                        ),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])

        return BatchFeature(data={**text_inputs, **image_inputs})


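A note on the placeholder expansion in `__call__` above: each image token in the prompt is replaced by `image_grid_thw[i].prod() // merge_size // merge_size` copies of itself before tokenization, so the token count matches the number of merged vision tokens. A minimal sketch of that arithmetic, assuming a hypothetical 2x2 merge size and a 1x28x28 patch grid (values not taken from the diff):

```python
import torch

merge_size = 2                                # hypothetical; read from the image processor in practice
image_grid_thw = torch.tensor([[1, 28, 28]])  # one image: 1 temporal frame, 28 x 28 patches
num_placeholders = image_grid_thw[0].prod() // merge_size // merge_size
print(num_placeholders)  # tensor(196): the image token is repeated 196 times in the prompt
```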
class PaddleOCRVisionConfig(SiglipVisionConfig):
    r"""
    This is the configuration class to store the configuration of a [`PaddleOCRVisionModel`]. It is used to instantiate a
    PaddleOCRVL vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the PaddleOCRVL
    [PaddlePaddle/PaddleOCR-VL](https://huggingface.co/PaddlePaddle/PaddleOCR-VL) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1152):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 4304):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 27):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        spatial_merge_size (`int`, *optional*, defaults to 2):
            The size used for merging spatial dimensions.

    Example:

    ```python
    >>> from transformers import PaddleOCRVisionConfig, PaddleOCRVisionModel

    >>> # Initializing a PaddleOCRVisionConfig with PaddlePaddle/PaddleOCR-VL style configuration
    >>> configuration = PaddleOCRVisionConfig()

    >>> # Initializing a PaddleOCRVisionModel (with random weights) from the PaddlePaddle/PaddleOCR-VL style configuration
    >>> model = PaddleOCRVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "paddleocr_vl_vision"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1152,
        intermediate_size=4304,
        num_hidden_layers=27,
        num_attention_heads=16,
        num_channels=3,
        image_size=384,
        patch_size=14,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        spatial_merge_size=2,
        **kwargs,
    ):
        super().__init__()
        self.spatial_merge_size = spatial_merge_size


class PaddleOCRTextConfig(Ernie4_5Config):
    model_type = "paddleocr_vl_text"


class PaddleOCRVLConfig(Qwen2VLConfig):
    r"""
    This is the configuration class to store the configuration of a [`PaddleOCRVLForConditionalGeneration`]. It is used to instantiate a
    PaddleOCRVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of
    PaddleOCRVL [PaddlePaddle/PaddleOCR-VL](https://huggingface.co/PaddlePaddle/PaddleOCR-VL).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `PaddleOCRTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `PaddleOCRVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 100295):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 100296):
            The video token index to encode the video prompt.
        vision_start_token_id (`int`, *optional*, defaults to 101305):
            The token index to denote the start of vision input.
        vision_end_token_id (`int`, *optional*, defaults to 101306):
            The token index to denote the end of vision input.

    ```python
    >>> from transformers import PaddleOCRVLForConditionalGeneration, PaddleOCRVLConfig

    >>> # Initializing a PaddleOCRVL style configuration
    >>> configuration = PaddleOCRVLConfig()

    >>> # Initializing a model from the PaddleOCRVL style configuration
    >>> model = PaddleOCRVLForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    sub_configs = {"vision_config": PaddleOCRVisionConfig, "text_config": PaddleOCRTextConfig}

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=100295,
        video_token_id=100296,
        vision_start_token_id=101305,
        vision_end_token_id=101306,
        **kwargs,
    ):
        super().__init__()


class PaddleOCRProjector(nn.Module):
    def __init__(self, config: PaddleOCRVLConfig):
        super().__init__()
        self.merge_kernel_size = (config.vision_config.spatial_merge_size, config.vision_config.spatial_merge_size)

        hidden_size = config.vision_config.hidden_size * self.merge_kernel_size[0] * self.merge_kernel_size[1]

        self.pre_norm = torch.nn.LayerNorm(config.vision_config.hidden_size, eps=1e-05)
        self.linear_1 = nn.Linear(hidden_size, hidden_size, bias=True)
        self.act = GELUActivation()
        self.linear_2 = nn.Linear(hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, image_features: torch.Tensor, image_grid_thw: torch.Tensor) -> torch.Tensor:
        image_features_chunks = image_features.split(image_grid_thw.prod(dim=1).tolist(), dim=0)
        m1, m2 = self.merge_kernel_size

        processed_features = []
        for image_feature, image_grid in zip(image_features_chunks, image_grid_thw):
            image_feature = self.pre_norm(image_feature)
            t, h, w = image_grid
            d = image_feature.shape[-1]
            h_block = h // m1
            w_block = w // m2

            image_feature = image_feature.reshape(t, h_block, m1, w_block, m2, d)
            image_feature = image_feature.transpose(2, 3)
            image_feature = image_feature.reshape(t * h_block * w_block, m1 * m2 * d)

            hidden_states = self.linear_1(image_feature)
            hidden_states = self.act(hidden_states)
            hidden_states = self.linear_2(hidden_states)
            processed_features.append(hidden_states)

        return torch.cat(processed_features, dim=0)


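For readers following `PaddleOCRProjector.forward`: the reshape/transpose/reshape sequence groups each non-overlapping `m1 x m2` block of patches into a single token whose feature dimension grows by `m1 * m2`, before the two linear layers project it to the text hidden size. A minimal standalone sketch of that shape transformation, with hypothetical sizes (t=1, h=4, w=6, d=8 and a 2x2 merge kernel):

```python
import torch

t, h, w, d = 1, 4, 6, 8  # hypothetical patch grid and feature width
m1 = m2 = 2              # 2 x 2 spatial merge kernel

x = torch.randn(t * h * w, d)                    # (24, 8) patch features for one image
x = x.reshape(t, h // m1, m1, w // m2, m2, d)    # split H and W into 2 x 2 blocks
x = x.transpose(2, 3)                            # bring each block's patches next to each other
x = x.reshape(t * (h // m1) * (w // m2), m1 * m2 * d)
print(x.shape)  # torch.Size([6, 32]): 4x fewer tokens, 4x wider features
```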
class PaddleOCRVisionRotaryEmbedding(VisionRotaryEmbedding):
    pass


class PaddleOCRRotaryEmbedding(Qwen2VLRotaryEmbedding):
    pass


class PaddleOCRMLP(Ernie4_5MLP):
    def __init__(self, config: PaddleOCRTextConfig):
        super().__init__()


class PaddleOCRAttention(Qwen2_5OmniAttention):
    def __init__(self, config: PaddleOCRVLConfig, layer_idx: Optional[int] = None):
        super().__init__()

        self.attention_dropout = 0.0
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.use_bias)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.use_bias)


class PaddleOCRRMSNorm(Ernie4_5RMSNorm):
    pass


class PaddleOCRDecoderLayer(Ernie4_5DecoderLayer):
    def __init__(self, config: PaddleOCRTextConfig, layer_idx: int):
        super().__init__()


@auto_docstring
class PaddleOCRVLPreTrainedModel(PreTrainedModel):
    config: PaddleOCRVLConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PaddleOCRDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    _can_compile_fullgraph = True
    _supports_attention_backend = True

    _can_record_outputs = {
        "hidden_states": PaddleOCRDecoderLayer,
        "attentions": PaddleOCRAttention,
    }


class PaddleOCRTextModel(PaddleOCRVLPreTrainedModel, Ernie4_5Model):
    def __init__(self, config: PaddleOCRTextConfig):
        super().__init__(config)

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position: torch.Tensor = (
                torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device) + past_seen_tokens
            )

        if position_ids is None:
            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
        elif position_ids.ndim == 2:
            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)

        if position_ids.ndim == 3 and position_ids.shape[0] == 4:
            text_position_ids = position_ids[0]
            position_ids = position_ids[1:]
        else:
            text_position_ids = None

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=text_position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_embeddings=position_embeddings,
                position_ids=text_position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


class PaddleOCRVisionModel(PaddleOCRVLPreTrainedModel):
    config: PaddleOCRVisionConfig
    main_input_name = "pixel_values"
    input_modalities = "image"

    def __init__(self, config: PaddleOCRVisionConfig):
        super().__init__(config)

        self.vision_model = PaddleOCRVisionTransformer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        cu_seqlens: torch.Tensor,
        image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
        **kwargs,
    ) -> BaseModelOutputWithPooling:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
                The tensors corresponding to the input images.
            cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
                The cumulative sequence lengths of each image or video feature.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        """
        return self.vision_model(
            pixel_values=pixel_values,
            cu_seqlens=cu_seqlens,
            image_grid_thw=image_grid_thw,
        )


class PaddleOCRVisionEmbeddings(SiglipVisionEmbeddings):
    def __init__(self, config: PaddleOCRVisionConfig):
        super().__init__()

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        num_positions = self.position_embedding.weight.shape[0]

        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)

        dim = embeddings.shape[-1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(height, width),
            mode="bilinear",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
    ) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, image_channels, patch_size, patch_size)`):
                The tensors corresponding to the input images.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        """
        batch_size, squence_len, channel, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        pixel_values = pixel_values.reshape(batch_size * squence_len, channel, height, width)
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        embeddings = patch_embeds.flatten(-2).squeeze(-1)
        embeddings = embeddings.reshape(batch_size, squence_len, -1)

        start = 0
        embeddings = embeddings.squeeze(0)
        tmp_embeddings = []
        for image_grid in image_grid_thw:
            t, h, w = image_grid
            end = start + t * h * w
            image_embeddings = embeddings[start:end, :]
            position_embedding = self.interpolate_pos_encoding(image_embeddings, h, w).squeeze(0).repeat(t, 1)
            image_embeddings = image_embeddings + position_embedding
            tmp_embeddings.append(image_embeddings)
            start = end
        embeddings = torch.concat(tmp_embeddings, dim=0)

        return embeddings


class PaddleOCRVisionAttention(VideoLlama3VisionAttention):
    def __init__(self, config: PaddleOCRVisionConfig):
        super().__init__()


class PaddleOCRVisionMLP(SiglipMLP):
    def __init__(self, config: PaddleOCRVisionConfig):
        super().__init__()


class PaddleOCRVisionEncoderLayer(VideoLlama3VisionEncoderLayer):
    def __init__(self, config: PaddleOCRVisionConfig):
        super().__init__()


class PaddleOCRVisionEncoder(VideoLlama3VisionEncoder):
    def __init__(self, config: PaddleOCRVisionConfig):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.rotary_pos_emb = PaddleOCRVisionRotaryEmbedding(head_dim // 2)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        cu_seqlens: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
    ) -> BaseModelOutput:
        """
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
                The cumulative sequence lengths of each image or video feature.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                The attention mask used in the forward function, of shape `(batch_size, sequence_length)` if not `None`.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        """
        device = inputs_embeds.device
        hidden_states = inputs_embeds
        attention_mask = create_bidirectional_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
        )
        split_hids = []
        split_wids = []
        for t, h, w in image_grid_thw:
            image_pids = torch.arange(t * h * w, device=device) % (h * w)
            sample_hids = image_pids // w
            sample_wids = image_pids % w
            split_hids.append(sample_hids)
            split_wids.append(sample_wids)
        width_position_ids = torch.concat(split_wids, dim=0)
        height_position_ids = torch.concat(split_hids, dim=0)

        pids = torch.stack([height_position_ids, width_position_ids], dim=-1)
        max_grid_size = pids.max() + 1
        rotary_embeddings_max_grid = self.rotary_pos_emb(max_grid_size)
        rotary_embeddings = rotary_embeddings_max_grid[pids].flatten(1)
        rotary_embeddings = rotary_embeddings.repeat(1, 2)
        position_embeddings = (rotary_embeddings.cos(), rotary_embeddings.sin())

        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                cu_seqlens=cu_seqlens,
                position_embeddings=position_embeddings,
            )

        return BaseModelOutput(
            last_hidden_state=hidden_states,
        )


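The encoder above builds its 2D rotary embedding from per-patch row/column indices derived from `image_grid_thw`. A minimal sketch of just that index computation, for a hypothetical (t=1, h=2, w=3) grid:

```python
import torch

t, h, w = 1, 2, 3  # hypothetical image grid
image_pids = torch.arange(t * h * w) % (h * w)
row_ids = image_pids // w  # tensor([0, 0, 0, 1, 1, 1])
col_ids = image_pids % w   # tensor([0, 1, 2, 0, 1, 2])
pids = torch.stack([row_ids, col_ids], dim=-1)  # one (row, col) pair per patch, indexed into the rotary table
```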
class PaddleOCRVisionTransformer(PaddleOCRVLPreTrainedModel):
    def __init__(self, config: PaddleOCRVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = PaddleOCRVisionEmbeddings(config)
        self.encoder = PaddleOCRVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        cu_seqlens: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[list[Union[tuple[int, int, int], list[tuple[int, int, int]]]]] = None,
        **kwargs,
    ) -> BaseModelOutputWithPooling:
        """
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size * patch_size * image_channels)`):
                The tensors corresponding to the input images.
            cu_seqlens (`torch.Tensor` of shape `(num_images + 1,)`):
                The cumulative sequence lengths of each image or video feature.
            attention_mask (`torch.Tensor`, *optional*):
                The attention mask used in the forward function, of shape `(batch_size, sequence_length)` if not `None`.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        """
        hidden_states = self.embeddings(pixel_values, image_grid_thw=image_grid_thw)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            cu_seqlens=cu_seqlens,
            attention_mask=attention_mask,
            image_grid_thw=image_grid_thw,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=None,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class PaddleOCRVLModelOutputWithPast(Qwen2VLModelOutputWithPast):
    pass


class PaddleOCRVLCausalLMOutputWithPast(Qwen2VLCausalLMOutputWithPast):
    pass


class PaddleOCRVLModel(Qwen2VLModel):
    _checkpoint_conversion_mapping = {"^model": "language_model"}
    _keys_to_ignore_on_load_unexpected = ["packing_position_embedding", "vision_model.head"]

    def __init__(self, config: PaddleOCRVLConfig):
        super().__init__(config)
        self.visual = PaddleOCRVisionModel._from_config(config.vision_config)
        self.projector = PaddleOCRProjector(config)
        self.language_model = PaddleOCRTextModel._from_config(config.text_config)
        self.rope_deltas = None

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.embed_tokens

    def set_input_embeddings(self, value):
        self.language_model.embed_tokens = value

    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
        """
        pixel_values = pixel_values.type(self.visual.dtype).unsqueeze(0)
        cu_seqlens = torch.repeat_interleave(image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0]).cumsum(
            dim=0,
            # Select dtype based on the following factors:
            #  - FA2 requires that cu_seqlens_q must have dtype int32
            #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
            # See https://github.com/huggingface/transformers/pull/34852 for more information
            dtype=image_grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)
        vision_outputs = self.visual(
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            cu_seqlens=cu_seqlens,
        )
        image_embeds = vision_outputs.last_hidden_state
        image_embeds = self.projector(image_embeds, image_grid_thw)
        return image_embeds

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, PaddleOCRVLModelOutputWithPast]:
        r"""
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
        """
        if inputs_embeds is None:
            inputs_embeds = self.language_model.embed_tokens(input_ids)

        if pixel_values is not None:
            image_embeds = self.get_image_features(pixel_values, image_grid_thw).to(
                inputs_embeds.device, inputs_embeds.dtype
            )
            image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds)
            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

        if position_ids is None:
            past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
            if self.rope_deltas is None or past_key_values_length == 0:
                position_ids, rope_deltas = self.get_rope_index(
                    input_ids=input_ids,
                    image_grid_thw=image_grid_thw,
                    attention_mask=attention_mask,
                )
                self.rope_deltas = rope_deltas
            # then use the prev pre-calculated rope-deltas to get the correct position ids
            else:
                batch_size, seq_length, _ = inputs_embeds.shape
                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
                position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
                delta = (past_key_values_length + self.rope_deltas).to(inputs_embeds.device)
                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
                position_ids = position_ids + delta.to(position_ids.device)

        outputs = self.language_model(
            input_ids=None,
            position_ids=position_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        output = PaddleOCRVLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            rope_deltas=self.rope_deltas,
        )

        return output


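`get_image_features` above derives `cu_seqlens` (cumulative per-image patch counts) from `image_grid_thw` before calling the vision tower. A minimal sketch of that computation for two hypothetical images, with grids (1, 4, 6) and (1, 2, 2):

```python
import torch

image_grid_thw = torch.tensor([[1, 4, 6], [1, 2, 2]])  # two hypothetical images: 24 and 4 patches
cu_seqlens = torch.repeat_interleave(
    image_grid_thw[:, 1] * image_grid_thw[:, 2], image_grid_thw[:, 0]
).cumsum(dim=0, dtype=torch.int32)
cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0)
print(cu_seqlens)  # tensor([ 0, 24, 28], dtype=torch.int32): patch boundaries per image
```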
class PaddleOCRVLForConditionalGeneration(Qwen2VLForConditionalGeneration):
    _checkpoint_conversion_mapping = {
        "^visual": "model.visual",
        "^mlp_AR": "model.projector",
        r"^model(?!(\.visual|\.projector|\.language_model))": "model.language_model",
    }
    _keys_to_ignore_on_load_unexpected = ["packing_position_embedding", "vision_model.head"]

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        rope_deltas: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, PaddleOCRVLCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.

        Example:

        ```python
        >>> from transformers import AutoProcessor, PaddleOCRVLForConditionalGeneration

        >>> model = PaddleOCRVLForConditionalGeneration.from_pretrained("PaddlePaddle/PaddleOCR-VL", dtype="bfloat16")
        >>> processor = AutoProcessor.from_pretrained("PaddlePaddle/PaddleOCR-VL")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/ocr_demo.jpg",
                    },
                    {"type": "text", "text": "OCR:"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        ).to(model.device)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=1024)
        >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        >>> print(output_text)
        ```
        """
        outputs: PaddleOCRVLModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            image_grid_thw=image_grid_thw,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            pixel_values=pixel_values,
            rope_deltas=rope_deltas,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = outputs.last_hidden_state

        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return PaddleOCRVLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            rope_deltas=outputs.rope_deltas,
        )


__all__ = [
    "PaddleOCRVLForConditionalGeneration",
    "PaddleOCRVLModel",
    "PaddleOCRVLPreTrainedModel",
    "PaddleOCRVisionTransformer",
    "PaddleOCRVLConfig",
    "PaddleOCRTextModel",
    "PaddleOCRVisionModel",
    "PaddleOCRVisionConfig",
    "PaddleOCRTextConfig",
    "PaddleOCRVLImageProcessor",
    "PaddleOCRVLImageProcessorFast",
    "PaddleOCRVLProcessor",
]