transformers 4.57.3__py3-none-any.whl → 4.57.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +1 -1
- transformers/generation/utils.py +4 -2
- transformers/models/apertus/modeling_apertus.py +1 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +1 -1
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +1 -1
- transformers/models/aya_vision/modeling_aya_vision.py +1 -1
- transformers/models/aya_vision/modular_aya_vision.py +1 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blip_2/modeling_blip_2.py +1 -1
- transformers/models/blt/modeling_blt.py +2 -2
- transformers/models/blt/modular_blt.py +2 -2
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +2 -2
- transformers/models/cohere2_vision/modular_cohere2_vision.py +2 -2
- transformers/models/csm/modeling_csm.py +2 -2
- transformers/models/csm/modular_csm.py +2 -2
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +1 -1
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +1 -1
- transformers/models/dinov2/modeling_dinov2.py +1 -1
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +1 -1
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/dots1/modeling_dots1.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +2 -2
- transformers/models/edgetam/modular_edgetam.py +1 -1
- transformers/models/efficientloftr/modeling_efficientloftr.py +1 -1
- transformers/models/emu3/modeling_emu3.py +1 -1
- transformers/models/eomt/modeling_eomt.py +1 -1
- transformers/models/eomt/modular_eomt.py +1 -1
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +1 -1
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +1 -1
- transformers/models/esm/modeling_esm.py +1 -1
- transformers/models/evolla/modeling_evolla.py +2 -2
- transformers/models/evolla/modular_evolla.py +2 -2
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/exaone4/modular_exaone4.py +1 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +1 -1
- transformers/models/flex_olmo/modular_flex_olmo.py +1 -1
- transformers/models/gemma/modeling_gemma.py +1 -1
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma3/modeling_gemma3.py +1 -1
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm4_moe/modeling_glm4_moe.py +1 -1
- transformers/models/glm4v/modeling_glm4v.py +1 -1
- transformers/models/glm4v/modular_glm4v.py +1 -1
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +1 -1
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_oss/modeling_gpt_oss.py +1 -1
- transformers/models/gpt_oss/modular_gpt_oss.py +1 -1
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +1 -1
- transformers/models/idefics/modeling_idefics.py +1 -1
- transformers/models/instructblip/modeling_instructblip.py +1 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +1 -1
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/modeling_llama4.py +1 -1
- transformers/models/longcat_flash/modeling_longcat_flash.py +1 -1
- transformers/models/minimax/modeling_minimax.py +1 -1
- transformers/models/minimax/modular_minimax.py +1 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral/modular_ministral.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral/modular_mistral.py +1 -1
- transformers/models/mixtral/modeling_mixtral.py +1 -1
- transformers/models/mllama/modeling_mllama.py +3 -3
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +1 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +1 -1
- transformers/models/moonshine/modeling_moonshine.py +2 -2
- transformers/models/moonshine/modular_moonshine.py +2 -2
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/parakeet/modeling_parakeet.py +1 -1
- transformers/models/parakeet/modular_parakeet.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +1 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +1 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2/modular_qwen2.py +1 -1
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +1 -1
- transformers/models/qwen3_next/modeling_qwen3_next.py +1 -1
- transformers/models/qwen3_next/modular_qwen3_next.py +1 -1
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +4 -4
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +1 -1
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl/modular_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +3 -3
- transformers/models/sam/modeling_sam.py +1 -1
- transformers/models/sam2/modeling_sam2.py +3 -3
- transformers/models/sam2/modular_sam2.py +3 -3
- transformers/models/sam_hq/modeling_sam_hq.py +1 -1
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/siglip/modeling_siglip.py +1 -1
- transformers/models/siglip2/modeling_siglip2.py +1 -1
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/starcoder2/modular_starcoder2.py +1 -1
- transformers/models/t5gemma/modeling_t5gemma.py +2 -2
- transformers/models/t5gemma/modular_t5gemma.py +2 -2
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/voxtral/modeling_voxtral.py +1 -1
- transformers/models/voxtral/modular_voxtral.py +1 -1
- transformers/tokenization_utils_base.py +6 -1
- transformers/utils/generic.py +3 -1
- {transformers-4.57.3.dist-info → transformers-4.57.4.dist-info}/METADATA +1 -1
- {transformers-4.57.3.dist-info → transformers-4.57.4.dist-info}/RECORD +119 -119
- {transformers-4.57.3.dist-info → transformers-4.57.4.dist-info}/WHEEL +0 -0
- {transformers-4.57.3.dist-info → transformers-4.57.4.dist-info}/entry_points.txt +0 -0
- {transformers-4.57.3.dist-info → transformers-4.57.4.dist-info}/licenses/LICENSE +0 -0
- {transformers-4.57.3.dist-info → transformers-4.57.4.dist-info}/top_level.txt +0 -0
transformers/__init__.py
CHANGED
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).
 
-__version__ = "4.57.3"
+__version__ = "4.57.4"
 
 from pathlib import Path
 from typing import TYPE_CHECKING
transformers/generation/utils.py
CHANGED
@@ -2379,9 +2379,11 @@ class GenerationMixin(ContinuousMixin):
             generation_config, use_model_defaults, **kwargs
         )
         generation_mode = generation_config.get_generation_mode(assistant_model)
+        deprecated_mode_repo = self._get_deprecated_gen_repo(generation_mode, trust_remote_code, custom_generate)
+
         if isinstance(custom_generate, Callable):
             decoding_method = custom_generate
-        else:
+        elif deprecated_mode_repo is None:
             # type() required to access the unbound class-level method
             decoding_method = getattr(type(self), GENERATION_MODES_MAPPING[generation_mode])
 

@@ -2392,7 +2394,7 @@ class GenerationMixin(ContinuousMixin):
         # NOTE: This must come after initializing generation_config, since we need it to determine if this is a deprecated mode.
         # It must also be before any preparation steps, since Hub repos expect to be loaded before preparation steps.
         # TODO joao, manuel: remove this in v4.62.0
-        if deprecated_mode_repo
+        if deprecated_mode_repo is not None:
             return GenerationMixin.generate(
                 self,
                 inputs=inputs,
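These two hunks compute the deprecated-mode Hub repo once, before the decoding method is chosen: a user-supplied `custom_generate` callable still wins, built-in methods are only selected when the mode has not been moved out of the library, and deprecated modes fall back to `GenerationMixin.generate`. Below is a minimal sketch of that dispatch order only; `MODE_TO_METHOD`, `DEPRECATED_MODE_REPOS`, `get_deprecated_repo`, and `pick_decoding_method` are hypothetical stand-ins, not the transformers API.

from typing import Callable, Optional

# Hypothetical stand-ins for GENERATION_MODES_MAPPING and _get_deprecated_gen_repo.
MODE_TO_METHOD = {"sample": "_sample", "beam_search": "_beam_search"}
DEPRECATED_MODE_REPOS = {"contrastive_search": "example-org/contrastive-search"}

def get_deprecated_repo(mode: str) -> Optional[str]:
    # Returns a Hub repo id if the generation mode was moved out of the library.
    return DEPRECATED_MODE_REPOS.get(mode)

def pick_decoding_method(mode: str, custom_generate: Optional[Callable] = None):
    deprecated_repo = get_deprecated_repo(mode)   # computed up front, as in the new line 2382
    if callable(custom_generate):                 # explicit user override always wins
        return custom_generate
    if deprecated_repo is None:                   # normal case: built-in decoding method
        return MODE_TO_METHOD[mode]
    return f"delegate to Hub repo {deprecated_repo}"  # deprecated mode: the `is not None` branch

print(pick_decoding_method("sample"))              # -> "_sample"
print(pick_decoding_method("contrastive_search"))  # -> "delegate to Hub repo example-org/contrastive-search"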
@@ -1007,7 +1007,7 @@ class Blip2QFormerModel(Blip2PreTrainedModel):
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
         return extended_attention_mask
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -577,7 +577,7 @@ class BltLocalDecoder(BltPreTrainedModel):
 
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -1047,7 +1047,7 @@ class BltModel(BltPreTrainedModel):
         self.patcher = None
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -536,7 +536,7 @@ class BltLocalDecoder(BltPreTrainedModel):
 
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -799,7 +799,7 @@ class BltModel(BltPreTrainedModel):
         self.patcher = None
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -213,7 +213,7 @@ class Cohere2VisionModel(Cohere2VisionPreTrainedModel):
         )
         return special_image_mask
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -306,7 +306,7 @@ class Cohere2VisionForConditionalGeneration(Cohere2VisionPreTrainedModel, Genera
     def multi_modal_projector(self):
         return self.model.multi_modal_projector
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -107,7 +107,7 @@ class Cohere2VisionModel(AyaVisionModel):
         image_features = self.multi_modal_projector(selected_image_feature)
         return image_features
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -160,7 +160,7 @@ class Cohere2VisionForConditionalGeneration(AyaVisionForConditionalGeneration):
     def get_image_features(self, pixel_values: torch.FloatTensor):
         return self.model.get_image_features(pixel_values=pixel_values)
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -409,7 +409,7 @@ class CsmDepthDecoderModel(CsmPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -662,7 +662,7 @@ class CsmBackboneModel(CsmPreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -156,7 +156,7 @@ class CsmDepthDecoderModel(LlamaModel, CsmPreTrainedModel):
         self.embed_tokens = nn.Embedding((config.num_codebooks * config.vocab_size), config.backbone_hidden_size)
         self.inputs_embeds_projector = nn.Linear(config.backbone_hidden_size, config.hidden_size, bias=False)
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -395,7 +395,7 @@ class CsmBackboneModel(LlamaModel):
         super().__init__(config)
         self.embed_tokens = CsmBackboneModelEmbeddings(config)
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(self, **super_kwargs):
         r"""

@@ -624,7 +624,7 @@ class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
     def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
         return self.embeddings.patch_embeddings
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self, pixel_values: torch.Tensor, output_hidden_states: Optional[bool] = None, **kwargs

@@ -644,7 +644,7 @@ class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMi
     def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
         return self.embeddings.patch_embeddings
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -444,7 +444,7 @@ class EdgeTamVisionModel(EdgeTamPreTrainedModel):
 
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         pixel_values: Optional[torch.FloatTensor] = None,

@@ -1028,7 +1028,7 @@ class EdgeTamModel(EdgeTamPreTrainedModel):
         )
         return prompt_output
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -208,7 +208,7 @@ class EdgeTamVisionModel(Sam2VisionModel):
     def get_input_embeddings(self):
         raise NotImplementedError("Can't get input embeddings from timm wrapper model")
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         pixel_values: Optional[torch.FloatTensor] = None,

@@ -1087,7 +1087,7 @@ class EomtForUniversalSegmentation(EomtPreTrainedModel):
     def get_loss(self, loss_dict: dict[str, Tensor]) -> Tensor:
         return sum(loss_dict.values())
 
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,

@@ -610,7 +610,7 @@ class EvollaSaProtProteinEncoder(EvollaSaProtPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.Tensor],

@@ -1397,7 +1397,7 @@ class EvollaModel(EvollaPreTrainedModel):
         self.embed_tokens = value
 
     @auto_docstring
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -241,7 +241,7 @@ class EvollaSaProtProteinEncoder(EvollaSaProtPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.Tensor],

@@ -835,7 +835,7 @@ class EvollaModel(EvollaPreTrainedModel):
         self.embed_tokens = value
 
     @auto_docstring
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -364,7 +364,7 @@ class Exaone4Model(Exaone4PreTrainedModel, LlamaModel):
         # Initialize weights and apply final processing
         self.post_init()
 
-    @check_model_inputs
+    @check_model_inputs
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,

@@ -277,7 +277,7 @@ class FlexOlmoPreTrainedModel(MixtralPreTrainedModel):
 # FlexOlmo model is identical to Mixtral model except:
 # - FlexOlmo does not use sliding window attention.
 class FlexOlmoModel(MixtralModel):
-    @check_model_inputs
+    @check_model_inputs
     @auto_docstring
     def forward(
         self,
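The model-file hunks above all adjust the `@check_model_inputs` decorator applied to `forward`. For orientation only, here is a rough sketch of what an input-checking decorator of this kind does; it is a simplified stand-in under assumed behavior, not the implementation shipped in `transformers.utils.generic`, and `DemoConfig`/`DemoModel` are made-up names.

from functools import wraps

def check_model_inputs(func):
    # Simplified stand-in: fill in common forward() flags from the model config
    # when the caller leaves them unset, then run the wrapped method.
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        for flag in ("output_attentions", "output_hidden_states"):
            if kwargs.get(flag) is None:
                kwargs[flag] = getattr(self.config, flag, False)
        return func(self, *args, **kwargs)
    return wrapper

class DemoConfig:
    output_attentions = False
    output_hidden_states = True

class DemoModel:
    config = DemoConfig()

    @check_model_inputs
    def forward(self, input_ids, output_attentions=None, output_hidden_states=None):
        return {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states}

print(DemoModel().forward([1, 2, 3]))
# -> {'output_attentions': False, 'output_hidden_states': True}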