transformers-5.0.0rc3-py3-none-any.whl → transformers-5.1.0-py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
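A listing like the one below can be reproduced locally, since a wheel is a plain zip archive: unpack both wheels and diff their members. The sketch below is a minimal, illustrative version; it assumes the two wheel files named in the title have already been downloaded (e.g. with `pip download transformers==5.1.0 --no-deps`) and computes per-file `+/-` counts from unified diffs of the `.py` members.

```python
# Minimal sketch of reproducing this kind of wheel-to-wheel diff listing.
# The filenames below are assumptions: adjust them to wherever the two
# wheels were downloaded.
import difflib
import zipfile

OLD = "transformers-5.0.0rc3-py3-none-any.whl"
NEW = "transformers-5.1.0-py3-none-any.whl"


def py_members(path):
    # A wheel is a zip archive; map each .py member to its lines of text.
    with zipfile.ZipFile(path) as zf:
        return {
            name: zf.read(name).decode("utf-8", errors="replace").splitlines()
            for name in zf.namelist()
            if name.endswith(".py")
        }


old, new = py_members(OLD), py_members(NEW)
for name in sorted(set(old) | set(new)):
    diff = difflib.unified_diff(
        old.get(name, []), new.get(name, []), fromfile=name, tofile=name, lineterm=""
    )
    added = removed = 0
    for line in diff:
        if line.startswith("+") and not line.startswith("+++"):
            added += 1
        elif line.startswith("-") and not line.startswith("---"):
            removed += 1
    if added or removed:
        print(f"- {name} +{added} -{removed}")
```

Note that this naive sketch has no rename detection: a file moved between the two versions (shown below in `{old → new}` notation) appears as a full removal plus a full addition rather than as a small edit.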
- transformers/__init__.py +4 -11
- transformers/activations.py +2 -2
- transformers/backbone_utils.py +326 -0
- transformers/cache_utils.py +11 -2
- transformers/cli/serve.py +11 -8
- transformers/configuration_utils.py +1 -69
- transformers/conversion_mapping.py +146 -26
- transformers/convert_slow_tokenizer.py +6 -4
- transformers/core_model_loading.py +207 -118
- transformers/dependency_versions_check.py +0 -1
- transformers/dependency_versions_table.py +7 -8
- transformers/file_utils.py +0 -2
- transformers/generation/candidate_generator.py +1 -2
- transformers/generation/continuous_batching/cache.py +40 -38
- transformers/generation/continuous_batching/cache_manager.py +3 -16
- transformers/generation/continuous_batching/continuous_api.py +94 -406
- transformers/generation/continuous_batching/input_ouputs.py +464 -0
- transformers/generation/continuous_batching/requests.py +54 -17
- transformers/generation/continuous_batching/scheduler.py +77 -95
- transformers/generation/logits_process.py +10 -5
- transformers/generation/stopping_criteria.py +1 -2
- transformers/generation/utils.py +75 -95
- transformers/image_processing_utils.py +0 -3
- transformers/image_processing_utils_fast.py +17 -18
- transformers/image_transforms.py +44 -13
- transformers/image_utils.py +0 -5
- transformers/initialization.py +57 -0
- transformers/integrations/__init__.py +10 -24
- transformers/integrations/accelerate.py +47 -11
- transformers/integrations/deepspeed.py +145 -3
- transformers/integrations/executorch.py +2 -6
- transformers/integrations/finegrained_fp8.py +142 -7
- transformers/integrations/flash_attention.py +2 -7
- transformers/integrations/hub_kernels.py +18 -7
- transformers/integrations/moe.py +226 -106
- transformers/integrations/mxfp4.py +47 -34
- transformers/integrations/peft.py +488 -176
- transformers/integrations/tensor_parallel.py +641 -581
- transformers/masking_utils.py +153 -9
- transformers/modeling_flash_attention_utils.py +1 -2
- transformers/modeling_utils.py +359 -358
- transformers/models/__init__.py +6 -0
- transformers/models/afmoe/configuration_afmoe.py +14 -4
- transformers/models/afmoe/modeling_afmoe.py +8 -8
- transformers/models/afmoe/modular_afmoe.py +7 -7
- transformers/models/aimv2/configuration_aimv2.py +2 -7
- transformers/models/aimv2/modeling_aimv2.py +26 -24
- transformers/models/aimv2/modular_aimv2.py +8 -12
- transformers/models/albert/configuration_albert.py +8 -1
- transformers/models/albert/modeling_albert.py +3 -3
- transformers/models/align/configuration_align.py +8 -5
- transformers/models/align/modeling_align.py +22 -24
- transformers/models/altclip/configuration_altclip.py +4 -6
- transformers/models/altclip/modeling_altclip.py +30 -26
- transformers/models/apertus/configuration_apertus.py +5 -7
- transformers/models/apertus/modeling_apertus.py +4 -4
- transformers/models/apertus/modular_apertus.py +8 -10
- transformers/models/arcee/configuration_arcee.py +5 -7
- transformers/models/arcee/modeling_arcee.py +4 -4
- transformers/models/aria/configuration_aria.py +11 -21
- transformers/models/aria/modeling_aria.py +39 -36
- transformers/models/aria/modular_aria.py +33 -39
- transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
- transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
- transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
- transformers/models/auto/auto_factory.py +8 -6
- transformers/models/auto/configuration_auto.py +22 -0
- transformers/models/auto/image_processing_auto.py +17 -13
- transformers/models/auto/modeling_auto.py +15 -0
- transformers/models/auto/processing_auto.py +9 -18
- transformers/models/auto/tokenization_auto.py +17 -15
- transformers/models/autoformer/modeling_autoformer.py +2 -1
- transformers/models/aya_vision/configuration_aya_vision.py +4 -0
- transformers/models/aya_vision/modeling_aya_vision.py +29 -62
- transformers/models/aya_vision/modular_aya_vision.py +20 -45
- transformers/models/bamba/configuration_bamba.py +17 -7
- transformers/models/bamba/modeling_bamba.py +23 -55
- transformers/models/bamba/modular_bamba.py +19 -54
- transformers/models/bark/configuration_bark.py +2 -1
- transformers/models/bark/modeling_bark.py +24 -10
- transformers/models/bart/configuration_bart.py +9 -4
- transformers/models/bart/modeling_bart.py +9 -12
- transformers/models/beit/configuration_beit.py +2 -4
- transformers/models/beit/image_processing_beit_fast.py +3 -3
- transformers/models/beit/modeling_beit.py +14 -9
- transformers/models/bert/configuration_bert.py +12 -1
- transformers/models/bert/modeling_bert.py +6 -30
- transformers/models/bert_generation/configuration_bert_generation.py +17 -1
- transformers/models/bert_generation/modeling_bert_generation.py +6 -6
- transformers/models/big_bird/configuration_big_bird.py +12 -8
- transformers/models/big_bird/modeling_big_bird.py +0 -15
- transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
- transformers/models/biogpt/configuration_biogpt.py +8 -1
- transformers/models/biogpt/modeling_biogpt.py +4 -8
- transformers/models/biogpt/modular_biogpt.py +1 -5
- transformers/models/bit/configuration_bit.py +2 -4
- transformers/models/bit/modeling_bit.py +6 -5
- transformers/models/bitnet/configuration_bitnet.py +5 -7
- transformers/models/bitnet/modeling_bitnet.py +3 -4
- transformers/models/bitnet/modular_bitnet.py +3 -4
- transformers/models/blenderbot/configuration_blenderbot.py +8 -4
- transformers/models/blenderbot/modeling_blenderbot.py +4 -4
- transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
- transformers/models/blip/configuration_blip.py +9 -9
- transformers/models/blip/modeling_blip.py +55 -37
- transformers/models/blip_2/configuration_blip_2.py +2 -1
- transformers/models/blip_2/modeling_blip_2.py +81 -56
- transformers/models/bloom/configuration_bloom.py +5 -1
- transformers/models/bloom/modeling_bloom.py +2 -1
- transformers/models/blt/configuration_blt.py +23 -12
- transformers/models/blt/modeling_blt.py +20 -14
- transformers/models/blt/modular_blt.py +70 -10
- transformers/models/bridgetower/configuration_bridgetower.py +7 -1
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
- transformers/models/bridgetower/modeling_bridgetower.py +29 -15
- transformers/models/bros/configuration_bros.py +24 -17
- transformers/models/camembert/configuration_camembert.py +8 -1
- transformers/models/camembert/modeling_camembert.py +6 -6
- transformers/models/canine/configuration_canine.py +4 -1
- transformers/models/chameleon/configuration_chameleon.py +5 -7
- transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
- transformers/models/chameleon/modeling_chameleon.py +82 -36
- transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
- transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
- transformers/models/clap/configuration_clap.py +4 -8
- transformers/models/clap/modeling_clap.py +21 -22
- transformers/models/clip/configuration_clip.py +4 -1
- transformers/models/clip/image_processing_clip_fast.py +9 -0
- transformers/models/clip/modeling_clip.py +25 -22
- transformers/models/clipseg/configuration_clipseg.py +4 -1
- transformers/models/clipseg/modeling_clipseg.py +27 -25
- transformers/models/clipseg/processing_clipseg.py +11 -3
- transformers/models/clvp/configuration_clvp.py +14 -2
- transformers/models/clvp/modeling_clvp.py +19 -30
- transformers/models/codegen/configuration_codegen.py +4 -3
- transformers/models/codegen/modeling_codegen.py +2 -1
- transformers/models/cohere/configuration_cohere.py +5 -7
- transformers/models/cohere/modeling_cohere.py +4 -4
- transformers/models/cohere/modular_cohere.py +3 -3
- transformers/models/cohere2/configuration_cohere2.py +6 -8
- transformers/models/cohere2/modeling_cohere2.py +4 -4
- transformers/models/cohere2/modular_cohere2.py +9 -11
- transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
- transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
- transformers/models/colqwen2/modeling_colqwen2.py +7 -6
- transformers/models/colqwen2/modular_colqwen2.py +7 -6
- transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
- transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
- transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
- transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
- transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
- transformers/models/convbert/configuration_convbert.py +11 -7
- transformers/models/convnext/configuration_convnext.py +2 -4
- transformers/models/convnext/image_processing_convnext_fast.py +2 -2
- transformers/models/convnext/modeling_convnext.py +7 -6
- transformers/models/convnextv2/configuration_convnextv2.py +2 -4
- transformers/models/convnextv2/modeling_convnextv2.py +7 -6
- transformers/models/cpmant/configuration_cpmant.py +4 -0
- transformers/models/csm/configuration_csm.py +9 -15
- transformers/models/csm/modeling_csm.py +3 -3
- transformers/models/ctrl/configuration_ctrl.py +16 -0
- transformers/models/ctrl/modeling_ctrl.py +13 -25
- transformers/models/cwm/configuration_cwm.py +5 -7
- transformers/models/cwm/modeling_cwm.py +4 -4
- transformers/models/d_fine/configuration_d_fine.py +10 -56
- transformers/models/d_fine/modeling_d_fine.py +728 -868
- transformers/models/d_fine/modular_d_fine.py +335 -412
- transformers/models/dab_detr/configuration_dab_detr.py +22 -48
- transformers/models/dab_detr/modeling_dab_detr.py +11 -7
- transformers/models/dac/modeling_dac.py +1 -1
- transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
- transformers/models/data2vec/configuration_data2vec_text.py +11 -2
- transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
- transformers/models/data2vec/modeling_data2vec_text.py +6 -6
- transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
- transformers/models/dbrx/configuration_dbrx.py +11 -3
- transformers/models/dbrx/modeling_dbrx.py +6 -6
- transformers/models/dbrx/modular_dbrx.py +6 -6
- transformers/models/deberta/configuration_deberta.py +6 -0
- transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
- transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
- transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
- transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
- transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
- transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
- transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
- transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
- transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
- transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
- transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
- transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
- transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
- transformers/models/deit/modeling_deit.py +11 -7
- transformers/models/depth_anything/configuration_depth_anything.py +12 -42
- transformers/models/depth_anything/modeling_depth_anything.py +5 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
- transformers/models/depth_pro/modeling_depth_pro.py +8 -4
- transformers/models/detr/configuration_detr.py +18 -49
- transformers/models/detr/image_processing_detr_fast.py +11 -11
- transformers/models/detr/modeling_detr.py +695 -734
- transformers/models/dia/configuration_dia.py +4 -7
- transformers/models/dia/generation_dia.py +8 -17
- transformers/models/dia/modeling_dia.py +7 -7
- transformers/models/dia/modular_dia.py +4 -4
- transformers/models/diffllama/configuration_diffllama.py +5 -7
- transformers/models/diffllama/modeling_diffllama.py +3 -8
- transformers/models/diffllama/modular_diffllama.py +2 -7
- transformers/models/dinat/configuration_dinat.py +2 -4
- transformers/models/dinat/modeling_dinat.py +7 -6
- transformers/models/dinov2/configuration_dinov2.py +2 -4
- transformers/models/dinov2/modeling_dinov2.py +9 -8
- transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
- transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
- transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
- transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
- transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
- transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
- transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
- transformers/models/distilbert/configuration_distilbert.py +8 -1
- transformers/models/distilbert/modeling_distilbert.py +3 -3
- transformers/models/doge/configuration_doge.py +17 -7
- transformers/models/doge/modeling_doge.py +4 -4
- transformers/models/doge/modular_doge.py +20 -10
- transformers/models/donut/image_processing_donut_fast.py +4 -4
- transformers/models/dots1/configuration_dots1.py +16 -7
- transformers/models/dots1/modeling_dots1.py +4 -4
- transformers/models/dpr/configuration_dpr.py +19 -1
- transformers/models/dpt/configuration_dpt.py +23 -65
- transformers/models/dpt/image_processing_dpt_fast.py +5 -5
- transformers/models/dpt/modeling_dpt.py +19 -15
- transformers/models/dpt/modular_dpt.py +4 -4
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +53 -53
- transformers/models/edgetam/modular_edgetam.py +5 -7
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
- transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
- transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
- transformers/models/electra/configuration_electra.py +13 -2
- transformers/models/electra/modeling_electra.py +6 -6
- transformers/models/emu3/configuration_emu3.py +12 -10
- transformers/models/emu3/modeling_emu3.py +84 -47
- transformers/models/emu3/modular_emu3.py +77 -39
- transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
- transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
- transformers/models/eomt/configuration_eomt.py +12 -13
- transformers/models/eomt/image_processing_eomt_fast.py +3 -3
- transformers/models/eomt/modeling_eomt.py +3 -3
- transformers/models/eomt/modular_eomt.py +17 -17
- transformers/models/eomt_dinov3/__init__.py +28 -0
- transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
- transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
- transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
- transformers/models/ernie/configuration_ernie.py +24 -2
- transformers/models/ernie/modeling_ernie.py +6 -30
- transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
- transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
- transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
- transformers/models/esm/configuration_esm.py +9 -11
- transformers/models/esm/modeling_esm.py +3 -3
- transformers/models/esm/modeling_esmfold.py +1 -6
- transformers/models/esm/openfold_utils/protein.py +2 -3
- transformers/models/evolla/configuration_evolla.py +21 -8
- transformers/models/evolla/modeling_evolla.py +11 -7
- transformers/models/evolla/modular_evolla.py +5 -1
- transformers/models/exaone4/configuration_exaone4.py +8 -5
- transformers/models/exaone4/modeling_exaone4.py +4 -4
- transformers/models/exaone4/modular_exaone4.py +11 -8
- transformers/models/exaone_moe/__init__.py +27 -0
- transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
- transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
- transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
- transformers/models/falcon/configuration_falcon.py +9 -1
- transformers/models/falcon/modeling_falcon.py +3 -8
- transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
- transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
- transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
- transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
- transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
- transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
- transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
- transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
- transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
- transformers/models/flaubert/configuration_flaubert.py +10 -4
- transformers/models/flaubert/modeling_flaubert.py +1 -1
- transformers/models/flava/configuration_flava.py +4 -3
- transformers/models/flava/image_processing_flava_fast.py +4 -4
- transformers/models/flava/modeling_flava.py +36 -28
- transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
- transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
- transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
- transformers/models/florence2/configuration_florence2.py +4 -0
- transformers/models/florence2/modeling_florence2.py +57 -32
- transformers/models/florence2/modular_florence2.py +48 -26
- transformers/models/fnet/configuration_fnet.py +6 -1
- transformers/models/focalnet/configuration_focalnet.py +2 -4
- transformers/models/focalnet/modeling_focalnet.py +10 -7
- transformers/models/fsmt/configuration_fsmt.py +12 -16
- transformers/models/funnel/configuration_funnel.py +8 -0
- transformers/models/fuyu/configuration_fuyu.py +5 -8
- transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
- transformers/models/fuyu/modeling_fuyu.py +24 -23
- transformers/models/gemma/configuration_gemma.py +5 -7
- transformers/models/gemma/modeling_gemma.py +4 -4
- transformers/models/gemma/modular_gemma.py +5 -7
- transformers/models/gemma2/configuration_gemma2.py +5 -7
- transformers/models/gemma2/modeling_gemma2.py +4 -4
- transformers/models/gemma2/modular_gemma2.py +8 -10
- transformers/models/gemma3/configuration_gemma3.py +28 -22
- transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
- transformers/models/gemma3/modeling_gemma3.py +37 -33
- transformers/models/gemma3/modular_gemma3.py +46 -42
- transformers/models/gemma3n/configuration_gemma3n.py +35 -22
- transformers/models/gemma3n/modeling_gemma3n.py +86 -58
- transformers/models/gemma3n/modular_gemma3n.py +112 -75
- transformers/models/git/configuration_git.py +5 -7
- transformers/models/git/modeling_git.py +31 -41
- transformers/models/glm/configuration_glm.py +7 -9
- transformers/models/glm/modeling_glm.py +4 -4
- transformers/models/glm4/configuration_glm4.py +7 -9
- transformers/models/glm4/modeling_glm4.py +4 -4
- transformers/models/glm46v/configuration_glm46v.py +4 -0
- transformers/models/glm46v/image_processing_glm46v.py +5 -2
- transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
- transformers/models/glm46v/modeling_glm46v.py +91 -46
- transformers/models/glm46v/modular_glm46v.py +4 -0
- transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
- transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
- transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
- transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
- transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
- transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
- transformers/models/glm4v/configuration_glm4v.py +12 -8
- transformers/models/glm4v/image_processing_glm4v.py +5 -2
- transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
- transformers/models/glm4v/modeling_glm4v.py +120 -63
- transformers/models/glm4v/modular_glm4v.py +82 -50
- transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
- transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
- transformers/models/glm_image/configuration_glm_image.py +26 -20
- transformers/models/glm_image/image_processing_glm_image.py +1 -1
- transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
- transformers/models/glm_image/modeling_glm_image.py +337 -236
- transformers/models/glm_image/modular_glm_image.py +415 -255
- transformers/models/glm_image/processing_glm_image.py +65 -17
- transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
- transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
- transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
- transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
- transformers/models/glmasr/modeling_glmasr.py +34 -28
- transformers/models/glmasr/modular_glmasr.py +23 -11
- transformers/models/glpn/image_processing_glpn_fast.py +3 -3
- transformers/models/glpn/modeling_glpn.py +4 -2
- transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
- transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
- transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
- transformers/models/gpt2/configuration_gpt2.py +13 -1
- transformers/models/gpt2/modeling_gpt2.py +5 -5
- transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
- transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
- transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
- transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
- transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
- transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
- transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
- transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
- transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
- transformers/models/gptj/configuration_gptj.py +4 -4
- transformers/models/gptj/modeling_gptj.py +3 -7
- transformers/models/granite/configuration_granite.py +5 -7
- transformers/models/granite/modeling_granite.py +4 -4
- transformers/models/granite_speech/modeling_granite_speech.py +63 -37
- transformers/models/granitemoe/configuration_granitemoe.py +5 -7
- transformers/models/granitemoe/modeling_granitemoe.py +4 -4
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
- transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
- transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
- transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
- transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
- transformers/models/groupvit/configuration_groupvit.py +4 -1
- transformers/models/groupvit/modeling_groupvit.py +29 -22
- transformers/models/helium/configuration_helium.py +5 -7
- transformers/models/helium/modeling_helium.py +4 -4
- transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
- transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
- transformers/models/hiera/configuration_hiera.py +2 -4
- transformers/models/hiera/modeling_hiera.py +11 -8
- transformers/models/hubert/configuration_hubert.py +4 -1
- transformers/models/hubert/modeling_hubert.py +7 -4
- transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
- transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
- transformers/models/ibert/configuration_ibert.py +4 -1
- transformers/models/idefics/configuration_idefics.py +5 -7
- transformers/models/idefics/modeling_idefics.py +3 -4
- transformers/models/idefics/vision.py +5 -4
- transformers/models/idefics2/configuration_idefics2.py +1 -2
- transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
- transformers/models/idefics2/modeling_idefics2.py +72 -50
- transformers/models/idefics3/configuration_idefics3.py +1 -3
- transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
- transformers/models/idefics3/modeling_idefics3.py +63 -40
- transformers/models/ijepa/modeling_ijepa.py +3 -3
- transformers/models/imagegpt/configuration_imagegpt.py +9 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
- transformers/models/imagegpt/modeling_imagegpt.py +8 -4
- transformers/models/informer/modeling_informer.py +3 -3
- transformers/models/instructblip/configuration_instructblip.py +2 -1
- transformers/models/instructblip/modeling_instructblip.py +65 -39
- transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
- transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
- transformers/models/internvl/configuration_internvl.py +5 -0
- transformers/models/internvl/modeling_internvl.py +35 -55
- transformers/models/internvl/modular_internvl.py +26 -38
- transformers/models/internvl/video_processing_internvl.py +2 -2
- transformers/models/jais2/configuration_jais2.py +5 -7
- transformers/models/jais2/modeling_jais2.py +4 -4
- transformers/models/jamba/configuration_jamba.py +5 -7
- transformers/models/jamba/modeling_jamba.py +4 -4
- transformers/models/jamba/modular_jamba.py +3 -3
- transformers/models/janus/image_processing_janus.py +2 -2
- transformers/models/janus/image_processing_janus_fast.py +8 -8
- transformers/models/janus/modeling_janus.py +63 -146
- transformers/models/janus/modular_janus.py +62 -20
- transformers/models/jetmoe/configuration_jetmoe.py +6 -4
- transformers/models/jetmoe/modeling_jetmoe.py +3 -3
- transformers/models/jetmoe/modular_jetmoe.py +3 -3
- transformers/models/kosmos2/configuration_kosmos2.py +10 -8
- transformers/models/kosmos2/modeling_kosmos2.py +56 -34
- transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
- transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
- transformers/models/lasr/configuration_lasr.py +2 -4
- transformers/models/lasr/modeling_lasr.py +3 -3
- transformers/models/lasr/modular_lasr.py +3 -3
- transformers/models/layoutlm/configuration_layoutlm.py +14 -1
- transformers/models/layoutlm/modeling_layoutlm.py +3 -3
- transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
- transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
- transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
- transformers/models/led/configuration_led.py +7 -8
- transformers/models/levit/image_processing_levit_fast.py +4 -4
- transformers/models/lfm2/configuration_lfm2.py +5 -7
- transformers/models/lfm2/modeling_lfm2.py +4 -4
- transformers/models/lfm2/modular_lfm2.py +3 -3
- transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
- transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
- transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
- transformers/models/lightglue/modeling_lightglue.py +3 -3
- transformers/models/lightglue/modular_lightglue.py +3 -3
- transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
- transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
- transformers/models/lilt/configuration_lilt.py +6 -1
- transformers/models/llama/configuration_llama.py +5 -7
- transformers/models/llama/modeling_llama.py +4 -4
- transformers/models/llama4/configuration_llama4.py +67 -47
- transformers/models/llama4/image_processing_llama4_fast.py +3 -3
- transformers/models/llama4/modeling_llama4.py +46 -44
- transformers/models/llava/configuration_llava.py +10 -0
- transformers/models/llava/image_processing_llava_fast.py +3 -3
- transformers/models/llava/modeling_llava.py +38 -65
- transformers/models/llava_next/configuration_llava_next.py +2 -1
- transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
- transformers/models/llava_next/modeling_llava_next.py +61 -60
- transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
- transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
- transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
- transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
- transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
- transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
- transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
- transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
- transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
- transformers/models/longformer/configuration_longformer.py +4 -1
- transformers/models/longt5/configuration_longt5.py +9 -6
- transformers/models/longt5/modeling_longt5.py +2 -1
- transformers/models/luke/configuration_luke.py +8 -1
- transformers/models/lw_detr/configuration_lw_detr.py +19 -31
- transformers/models/lw_detr/modeling_lw_detr.py +43 -44
- transformers/models/lw_detr/modular_lw_detr.py +36 -38
- transformers/models/lxmert/configuration_lxmert.py +16 -0
- transformers/models/m2m_100/configuration_m2m_100.py +7 -8
- transformers/models/m2m_100/modeling_m2m_100.py +3 -3
- transformers/models/mamba/configuration_mamba.py +5 -2
- transformers/models/mamba/modeling_mamba.py +18 -26
- transformers/models/mamba2/configuration_mamba2.py +5 -7
- transformers/models/mamba2/modeling_mamba2.py +22 -33
- transformers/models/marian/configuration_marian.py +10 -4
- transformers/models/marian/modeling_marian.py +4 -4
- transformers/models/markuplm/configuration_markuplm.py +4 -6
- transformers/models/markuplm/modeling_markuplm.py +3 -3
- transformers/models/mask2former/configuration_mask2former.py +12 -47
- transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
- transformers/models/mask2former/modeling_mask2former.py +18 -12
- transformers/models/maskformer/configuration_maskformer.py +14 -45
- transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
- transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
- transformers/models/maskformer/modeling_maskformer.py +15 -9
- transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
- transformers/models/mbart/configuration_mbart.py +9 -4
- transformers/models/mbart/modeling_mbart.py +9 -6
- transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
- transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
- transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
- transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
- transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
- transformers/models/mgp_str/modeling_mgp_str.py +4 -2
- transformers/models/mimi/configuration_mimi.py +4 -0
- transformers/models/mimi/modeling_mimi.py +40 -36
- transformers/models/minimax/configuration_minimax.py +8 -11
- transformers/models/minimax/modeling_minimax.py +5 -5
- transformers/models/minimax/modular_minimax.py +9 -12
- transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
- transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
- transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
- transformers/models/ministral/configuration_ministral.py +5 -7
- transformers/models/ministral/modeling_ministral.py +4 -4
- transformers/models/ministral/modular_ministral.py +5 -8
- transformers/models/ministral3/configuration_ministral3.py +4 -4
- transformers/models/ministral3/modeling_ministral3.py +4 -4
- transformers/models/ministral3/modular_ministral3.py +3 -3
- transformers/models/mistral/configuration_mistral.py +5 -7
- transformers/models/mistral/modeling_mistral.py +4 -4
- transformers/models/mistral/modular_mistral.py +3 -3
- transformers/models/mistral3/configuration_mistral3.py +4 -0
- transformers/models/mistral3/modeling_mistral3.py +36 -40
- transformers/models/mistral3/modular_mistral3.py +31 -32
- transformers/models/mixtral/configuration_mixtral.py +8 -11
- transformers/models/mixtral/modeling_mixtral.py +4 -4
- transformers/models/mlcd/modeling_mlcd.py +7 -5
- transformers/models/mlcd/modular_mlcd.py +7 -5
- transformers/models/mllama/configuration_mllama.py +5 -7
- transformers/models/mllama/image_processing_mllama_fast.py +6 -5
- transformers/models/mllama/modeling_mllama.py +19 -19
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
- transformers/models/mobilebert/configuration_mobilebert.py +4 -1
- transformers/models/mobilebert/modeling_mobilebert.py +3 -3
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
- transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
- transformers/models/mobilevit/modeling_mobilevit.py +4 -2
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
- transformers/models/modernbert/configuration_modernbert.py +46 -21
- transformers/models/modernbert/modeling_modernbert.py +146 -899
- transformers/models/modernbert/modular_modernbert.py +185 -908
- transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
- transformers/models/moonshine/configuration_moonshine.py +12 -7
- transformers/models/moonshine/modeling_moonshine.py +7 -7
- transformers/models/moonshine/modular_moonshine.py +19 -13
- transformers/models/moshi/configuration_moshi.py +28 -2
- transformers/models/moshi/modeling_moshi.py +4 -9
- transformers/models/mpnet/configuration_mpnet.py +6 -1
- transformers/models/mpt/configuration_mpt.py +16 -0
- transformers/models/mra/configuration_mra.py +8 -1
- transformers/models/mt5/configuration_mt5.py +9 -5
- transformers/models/mt5/modeling_mt5.py +5 -8
- transformers/models/musicgen/configuration_musicgen.py +12 -7
- transformers/models/musicgen/modeling_musicgen.py +6 -5
- transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
- transformers/models/mvp/configuration_mvp.py +8 -4
- transformers/models/mvp/modeling_mvp.py +6 -4
- transformers/models/nanochat/configuration_nanochat.py +5 -7
- transformers/models/nanochat/modeling_nanochat.py +4 -4
- transformers/models/nanochat/modular_nanochat.py +4 -4
- transformers/models/nemotron/configuration_nemotron.py +5 -7
- transformers/models/nemotron/modeling_nemotron.py +4 -14
- transformers/models/nllb/tokenization_nllb.py +7 -5
- transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
- transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
- transformers/models/nougat/image_processing_nougat_fast.py +8 -8
- transformers/models/nystromformer/configuration_nystromformer.py +8 -1
- transformers/models/olmo/configuration_olmo.py +5 -7
- transformers/models/olmo/modeling_olmo.py +4 -4
- transformers/models/olmo/modular_olmo.py +3 -3
- transformers/models/olmo2/configuration_olmo2.py +9 -11
- transformers/models/olmo2/modeling_olmo2.py +4 -4
- transformers/models/olmo2/modular_olmo2.py +7 -7
- transformers/models/olmo3/configuration_olmo3.py +10 -11
- transformers/models/olmo3/modeling_olmo3.py +4 -4
- transformers/models/olmo3/modular_olmo3.py +13 -14
- transformers/models/olmoe/configuration_olmoe.py +5 -7
- transformers/models/olmoe/modeling_olmoe.py +4 -4
- transformers/models/olmoe/modular_olmoe.py +3 -3
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
- transformers/models/oneformer/configuration_oneformer.py +9 -46
- transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
- transformers/models/oneformer/modeling_oneformer.py +14 -9
- transformers/models/openai/configuration_openai.py +16 -0
- transformers/models/opt/configuration_opt.py +6 -6
- transformers/models/opt/modeling_opt.py +5 -5
- transformers/models/ovis2/configuration_ovis2.py +4 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
- transformers/models/ovis2/modeling_ovis2.py +58 -99
- transformers/models/ovis2/modular_ovis2.py +52 -13
- transformers/models/owlv2/configuration_owlv2.py +4 -1
- transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
- transformers/models/owlv2/modeling_owlv2.py +40 -27
- transformers/models/owlv2/modular_owlv2.py +5 -5
- transformers/models/owlvit/configuration_owlvit.py +4 -1
- transformers/models/owlvit/modeling_owlvit.py +40 -27
- transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
- transformers/models/paligemma/configuration_paligemma.py +4 -0
- transformers/models/paligemma/modeling_paligemma.py +30 -26
- transformers/models/parakeet/configuration_parakeet.py +2 -4
- transformers/models/parakeet/modeling_parakeet.py +3 -3
- transformers/models/parakeet/modular_parakeet.py +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
- transformers/models/patchtst/modeling_patchtst.py +3 -3
- transformers/models/pe_audio/modeling_pe_audio.py +4 -4
- transformers/models/pe_audio/modular_pe_audio.py +1 -1
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
- transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
- transformers/models/pe_video/modeling_pe_video.py +36 -24
- transformers/models/pe_video/modular_pe_video.py +36 -23
- transformers/models/pegasus/configuration_pegasus.py +8 -5
- transformers/models/pegasus/modeling_pegasus.py +4 -4
- transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
- transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
- transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
- transformers/models/perceiver/modeling_perceiver.py +17 -9
- transformers/models/perception_lm/modeling_perception_lm.py +26 -27
- transformers/models/perception_lm/modular_perception_lm.py +27 -25
- transformers/models/persimmon/configuration_persimmon.py +5 -7
- transformers/models/persimmon/modeling_persimmon.py +5 -5
- transformers/models/phi/configuration_phi.py +8 -6
- transformers/models/phi/modeling_phi.py +4 -4
- transformers/models/phi/modular_phi.py +3 -3
- transformers/models/phi3/configuration_phi3.py +9 -11
- transformers/models/phi3/modeling_phi3.py +4 -4
- transformers/models/phi3/modular_phi3.py +3 -3
- transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
- transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
- transformers/models/phimoe/configuration_phimoe.py +5 -7
- transformers/models/phimoe/modeling_phimoe.py +15 -39
- transformers/models/phimoe/modular_phimoe.py +12 -7
- transformers/models/pix2struct/configuration_pix2struct.py +12 -9
- transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
- transformers/models/pix2struct/modeling_pix2struct.py +14 -7
- transformers/models/pixio/configuration_pixio.py +2 -4
- transformers/models/pixio/modeling_pixio.py +9 -8
- transformers/models/pixio/modular_pixio.py +4 -2
- transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
- transformers/models/pixtral/modeling_pixtral.py +9 -12
- transformers/models/plbart/configuration_plbart.py +8 -5
- transformers/models/plbart/modeling_plbart.py +9 -7
- transformers/models/plbart/modular_plbart.py +1 -1
- transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
- transformers/models/pop2piano/configuration_pop2piano.py +7 -6
- transformers/models/pop2piano/modeling_pop2piano.py +2 -1
- transformers/models/pp_doclayout_v3/__init__.py +30 -0
- transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
- transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
- transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
- transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
- transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
- transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
- transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
- transformers/models/prophetnet/configuration_prophetnet.py +11 -10
- transformers/models/prophetnet/modeling_prophetnet.py +12 -23
- transformers/models/pvt/image_processing_pvt.py +7 -7
- transformers/models/pvt/image_processing_pvt_fast.py +1 -1
- transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
- transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
- transformers/models/qwen2/configuration_qwen2.py +14 -4
- transformers/models/qwen2/modeling_qwen2.py +4 -4
- transformers/models/qwen2/modular_qwen2.py +3 -3
- transformers/models/qwen2/tokenization_qwen2.py +0 -4
- transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
- transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
- transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
- transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
- transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
- transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
- transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
- transformers/models/qwen3/configuration_qwen3.py +15 -5
- transformers/models/qwen3/modeling_qwen3.py +4 -4
- transformers/models/qwen3/modular_qwen3.py +3 -3
- transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
- transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
- transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
- transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
- transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
- transformers/models/rag/configuration_rag.py +6 -6
- transformers/models/rag/modeling_rag.py +3 -3
- transformers/models/rag/retrieval_rag.py +1 -1
- transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
- transformers/models/reformer/configuration_reformer.py +7 -7
- transformers/models/rembert/configuration_rembert.py +8 -1
- transformers/models/rembert/modeling_rembert.py +0 -22
- transformers/models/resnet/configuration_resnet.py +2 -4
- transformers/models/resnet/modeling_resnet.py +6 -5
- transformers/models/roberta/configuration_roberta.py +11 -2
- transformers/models/roberta/modeling_roberta.py +6 -6
- transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
- transformers/models/roc_bert/configuration_roc_bert.py +8 -1
- transformers/models/roc_bert/modeling_roc_bert.py +6 -41
- transformers/models/roformer/configuration_roformer.py +13 -2
- transformers/models/roformer/modeling_roformer.py +0 -14
- transformers/models/rt_detr/configuration_rt_detr.py +8 -49
- transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
- transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
- transformers/models/rt_detr/modeling_rt_detr.py +578 -737
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
- transformers/models/rt_detr/modular_rt_detr.py +1508 -6
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
- transformers/models/rwkv/configuration_rwkv.py +2 -3
- transformers/models/rwkv/modeling_rwkv.py +0 -23
- transformers/models/sam/configuration_sam.py +2 -0
- transformers/models/sam/image_processing_sam_fast.py +4 -4
- transformers/models/sam/modeling_sam.py +13 -8
- transformers/models/sam/processing_sam.py +3 -3
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +56 -52
- transformers/models/sam2/modular_sam2.py +47 -55
- transformers/models/sam2_video/modeling_sam2_video.py +50 -51
- transformers/models/sam2_video/modular_sam2_video.py +12 -10
- transformers/models/sam3/modeling_sam3.py +43 -47
- transformers/models/sam3/processing_sam3.py +8 -4
- transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
- transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
- transformers/models/sam3_video/modeling_sam3_video.py +27 -14
- transformers/models/sam_hq/configuration_sam_hq.py +2 -0
- transformers/models/sam_hq/modeling_sam_hq.py +13 -9
- transformers/models/sam_hq/modular_sam_hq.py +6 -6
- transformers/models/sam_hq/processing_sam_hq.py +7 -6
- transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
- transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
- transformers/models/seed_oss/configuration_seed_oss.py +7 -9
- transformers/models/seed_oss/modeling_seed_oss.py +4 -4
- transformers/models/seed_oss/modular_seed_oss.py +3 -3
- transformers/models/segformer/image_processing_segformer_fast.py +4 -4
- transformers/models/segformer/modeling_segformer.py +4 -2
- transformers/models/segformer/modular_segformer.py +3 -3
- transformers/models/seggpt/modeling_seggpt.py +20 -8
- transformers/models/sew/configuration_sew.py +4 -1
- transformers/models/sew/modeling_sew.py +9 -5
- transformers/models/sew/modular_sew.py +2 -1
- transformers/models/sew_d/configuration_sew_d.py +4 -1
- transformers/models/sew_d/modeling_sew_d.py +4 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
- transformers/models/siglip/configuration_siglip.py +4 -1
- transformers/models/siglip/modeling_siglip.py +27 -71
- transformers/models/siglip2/__init__.py +1 -0
- transformers/models/siglip2/configuration_siglip2.py +4 -2
- transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
- transformers/models/siglip2/modeling_siglip2.py +37 -78
- transformers/models/siglip2/modular_siglip2.py +74 -25
- transformers/models/siglip2/tokenization_siglip2.py +95 -0
- transformers/models/smollm3/configuration_smollm3.py +6 -6
- transformers/models/smollm3/modeling_smollm3.py +4 -4
- transformers/models/smollm3/modular_smollm3.py +9 -9
- transformers/models/smolvlm/configuration_smolvlm.py +1 -3
- transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
- transformers/models/smolvlm/modeling_smolvlm.py +75 -46
- transformers/models/smolvlm/modular_smolvlm.py +36 -23
- transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
- transformers/models/solar_open/__init__.py +27 -0
- transformers/models/solar_open/configuration_solar_open.py +184 -0
- transformers/models/solar_open/modeling_solar_open.py +642 -0
- transformers/models/solar_open/modular_solar_open.py +224 -0
- transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
- transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
- transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
- transformers/models/speecht5/configuration_speecht5.py +7 -8
- transformers/models/splinter/configuration_splinter.py +6 -6
- transformers/models/splinter/modeling_splinter.py +8 -3
- transformers/models/squeezebert/configuration_squeezebert.py +14 -1
- transformers/models/stablelm/configuration_stablelm.py +8 -6
- transformers/models/stablelm/modeling_stablelm.py +5 -5
- transformers/models/starcoder2/configuration_starcoder2.py +11 -5
- transformers/models/starcoder2/modeling_starcoder2.py +5 -5
- transformers/models/starcoder2/modular_starcoder2.py +4 -4
- transformers/models/superglue/configuration_superglue.py +4 -0
- transformers/models/superglue/image_processing_superglue_fast.py +4 -3
- transformers/models/superglue/modeling_superglue.py +9 -4
- transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
- transformers/models/superpoint/modeling_superpoint.py +4 -2
- transformers/models/swin/configuration_swin.py +2 -4
- transformers/models/swin/modeling_swin.py +11 -8
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
- transformers/models/swin2sr/modeling_swin2sr.py +4 -2
- transformers/models/swinv2/configuration_swinv2.py +2 -4
- transformers/models/swinv2/modeling_swinv2.py +10 -7
- transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
- transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
- transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
- transformers/models/t5/configuration_t5.py +9 -8
- transformers/models/t5/modeling_t5.py +5 -8
- transformers/models/t5gemma/configuration_t5gemma.py +10 -25
- transformers/models/t5gemma/modeling_t5gemma.py +9 -9
- transformers/models/t5gemma/modular_t5gemma.py +11 -24
- transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
- transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
- transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
- transformers/models/table_transformer/configuration_table_transformer.py +18 -49
- transformers/models/table_transformer/modeling_table_transformer.py +27 -53
- transformers/models/tapas/configuration_tapas.py +12 -1
- transformers/models/tapas/modeling_tapas.py +1 -1
- transformers/models/tapas/tokenization_tapas.py +1 -0
- transformers/models/textnet/configuration_textnet.py +4 -6
- transformers/models/textnet/image_processing_textnet_fast.py +3 -3
- transformers/models/textnet/modeling_textnet.py +15 -14
- transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
- transformers/models/timesfm/modeling_timesfm.py +5 -6
- transformers/models/timesfm/modular_timesfm.py +5 -6
- transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
- transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
- transformers/models/trocr/configuration_trocr.py +11 -7
- transformers/models/trocr/modeling_trocr.py +4 -2
- transformers/models/tvp/configuration_tvp.py +10 -35
- transformers/models/tvp/image_processing_tvp_fast.py +6 -5
- transformers/models/tvp/modeling_tvp.py +1 -1
- transformers/models/udop/configuration_udop.py +16 -7
- transformers/models/udop/modeling_udop.py +10 -6
- transformers/models/umt5/configuration_umt5.py +8 -6
- transformers/models/umt5/modeling_umt5.py +7 -3
- transformers/models/unispeech/configuration_unispeech.py +4 -1
- transformers/models/unispeech/modeling_unispeech.py +7 -4
- transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
- transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
- transformers/models/upernet/configuration_upernet.py +8 -35
- transformers/models/upernet/modeling_upernet.py +1 -1
- transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
- transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
- transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
- transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
- transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
- transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
- transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
- transformers/models/video_llava/configuration_video_llava.py +4 -0
- transformers/models/video_llava/modeling_video_llava.py +87 -89
- transformers/models/videomae/modeling_videomae.py +4 -5
- transformers/models/vilt/configuration_vilt.py +4 -1
- transformers/models/vilt/image_processing_vilt_fast.py +6 -6
- transformers/models/vilt/modeling_vilt.py +27 -12
- transformers/models/vipllava/configuration_vipllava.py +4 -0
- transformers/models/vipllava/modeling_vipllava.py +57 -31
- transformers/models/vipllava/modular_vipllava.py +50 -24
- transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
- transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
- transformers/models/visual_bert/configuration_visual_bert.py +6 -1
- transformers/models/vit/configuration_vit.py +2 -2
- transformers/models/vit/modeling_vit.py +7 -5
- transformers/models/vit_mae/modeling_vit_mae.py +11 -7
- transformers/models/vit_msn/modeling_vit_msn.py +11 -7
- transformers/models/vitdet/configuration_vitdet.py +2 -4
- transformers/models/vitdet/modeling_vitdet.py +2 -3
- transformers/models/vitmatte/configuration_vitmatte.py +6 -35
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
- transformers/models/vitmatte/modeling_vitmatte.py +1 -1
- transformers/models/vitpose/configuration_vitpose.py +6 -43
- transformers/models/vitpose/modeling_vitpose.py +5 -3
- transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
- transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
- transformers/models/vits/configuration_vits.py +4 -0
- transformers/models/vits/modeling_vits.py +9 -7
- transformers/models/vivit/modeling_vivit.py +4 -4
- transformers/models/vjepa2/modeling_vjepa2.py +9 -9
- transformers/models/voxtral/configuration_voxtral.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +25 -24
- transformers/models/voxtral/modular_voxtral.py +26 -20
- transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
- transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
- transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
- transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
- transformers/models/wavlm/configuration_wavlm.py +4 -1
- transformers/models/wavlm/modeling_wavlm.py +4 -1
- transformers/models/whisper/configuration_whisper.py +6 -4
- transformers/models/whisper/generation_whisper.py +0 -1
- transformers/models/whisper/modeling_whisper.py +3 -3
- transformers/models/x_clip/configuration_x_clip.py +4 -1
- transformers/models/x_clip/modeling_x_clip.py +26 -27
- transformers/models/xglm/configuration_xglm.py +9 -7
- transformers/models/xlm/configuration_xlm.py +10 -7
- transformers/models/xlm/modeling_xlm.py +1 -1
- transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
- transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
- transformers/models/xlnet/configuration_xlnet.py +3 -1
- transformers/models/xlstm/configuration_xlstm.py +5 -7
- transformers/models/xlstm/modeling_xlstm.py +0 -32
- transformers/models/xmod/configuration_xmod.py +11 -2
- transformers/models/xmod/modeling_xmod.py +13 -16
- transformers/models/yolos/image_processing_yolos_fast.py +25 -28
- transformers/models/yolos/modeling_yolos.py +7 -7
- transformers/models/yolos/modular_yolos.py +16 -16
- transformers/models/yoso/configuration_yoso.py +8 -1
- transformers/models/youtu/__init__.py +27 -0
- transformers/models/youtu/configuration_youtu.py +194 -0
- transformers/models/youtu/modeling_youtu.py +619 -0
- transformers/models/youtu/modular_youtu.py +254 -0
- transformers/models/zamba/configuration_zamba.py +5 -7
- transformers/models/zamba/modeling_zamba.py +25 -56
- transformers/models/zamba2/configuration_zamba2.py +8 -13
- transformers/models/zamba2/modeling_zamba2.py +53 -78
- transformers/models/zamba2/modular_zamba2.py +36 -29
- transformers/models/zoedepth/configuration_zoedepth.py +17 -40
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
- transformers/models/zoedepth/modeling_zoedepth.py +5 -3
- transformers/pipelines/__init__.py +1 -61
- transformers/pipelines/any_to_any.py +1 -1
- transformers/pipelines/automatic_speech_recognition.py +0 -2
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/image_text_to_text.py +1 -1
- transformers/pipelines/text_to_audio.py +5 -1
- transformers/processing_utils.py +35 -44
- transformers/pytorch_utils.py +2 -26
- transformers/quantizers/quantizer_compressed_tensors.py +7 -5
- transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
- transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
- transformers/quantizers/quantizer_mxfp4.py +1 -1
- transformers/quantizers/quantizer_torchao.py +0 -16
- transformers/safetensors_conversion.py +11 -4
- transformers/testing_utils.py +3 -28
- transformers/tokenization_mistral_common.py +9 -0
- transformers/tokenization_python.py +6 -4
- transformers/tokenization_utils_base.py +119 -219
- transformers/tokenization_utils_tokenizers.py +31 -2
- transformers/trainer.py +25 -33
- transformers/trainer_seq2seq.py +1 -1
- transformers/training_args.py +411 -417
- transformers/utils/__init__.py +1 -4
- transformers/utils/auto_docstring.py +15 -18
- transformers/utils/backbone_utils.py +13 -373
- transformers/utils/doc.py +4 -36
- transformers/utils/generic.py +69 -33
- transformers/utils/import_utils.py +72 -75
- transformers/utils/loading_report.py +133 -105
- transformers/utils/quantization_config.py +0 -21
- transformers/video_processing_utils.py +5 -5
- transformers/video_utils.py +3 -1
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
- transformers/pipelines/deprecated/text2text_generation.py +0 -408
- transformers/pipelines/image_to_text.py +0 -189
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0
@@ -290,12 +290,145 @@ def deepspeed_config():
     return None
 
 
-def _load_state_dict_into_zero3_model(model_to_load, state_dict):
+def _apply_weight_conversions_to_state_dict(model, state_dict, weight_mapping):
+    """
+    Apply weight conversions (renaming and merging/splitting operations) to a state dict.
+    This is a simplified version that handles the conversion without loading into the model.
+    """
+    # Check for Tensor Parallelism - weight conversions are not tested with TP
+    # TP uses ReplaceWithTensorSlicing which may conflict with our weight conversions
+    ds_config = deepspeed_config()
+    if ds_config is not None:
+        # Check training config (tensor_parallel.autotp_size)
+        tp_size = ds_config.get("tensor_parallel", {}).get("autotp_size", 1)
+        # Check inference config (inference.tensor_parallel.tp_size)
+        inference_config = ds_config.get("inference", {})
+        if isinstance(inference_config, dict):
+            tp_size = max(tp_size, inference_config.get("tensor_parallel", {}).get("tp_size", 1))
+        if tp_size > 1:
+            raise NotImplementedError(
+                "Weight conversions (e.g., MoE expert fusion) with DeepSpeed Tensor Parallelism "
+                "are not yet implemented but support is coming soon. Please disable tensor_parallel "
+                "in your DeepSpeed config or convert your checkpoint to the expected format first."
+            )
+
+    from ..core_model_loading import WeightConverter, WeightRenaming, dot_natural_key, rename_source_key
+
+    # Preserve metadata from the original state dict
+    metadata = getattr(state_dict, "_metadata", None)
+
+    prefix = model.base_model_prefix
+
+    # Build a meta state dict for matching - only keys/shapes, no actual tensor data
+    # This minimizes memory since we don't duplicate the model's parameters
+    model_state_dict = {}
+    for key, param in model.state_dict().items():
+        model_state_dict[key] = torch.empty(param.shape, dtype=param.dtype, device="meta")
+
+    renamings = [entry for entry in weight_mapping if isinstance(entry, WeightRenaming)]
+    converters = [entry for entry in weight_mapping if isinstance(entry, WeightConverter)]
+
+    # Fast path: if we only have simple renamings and no converters, we can skip the expensive collection logic
+    if len(converters) == 0:
+        new_state_dict = {}
+        for original_key, tensor in state_dict.items():
+            renamed_key, _ = rename_source_key(original_key, renamings, [], prefix, model_state_dict)
+            if renamed_key in model_state_dict:
+                new_state_dict[renamed_key] = tensor
+        # Attach metadata to the new state dict
+        if metadata is not None:
+            new_state_dict._metadata = metadata
+        return new_state_dict
+
+    # Full path: we have WeightConverter operations that require tensor fusion/splitting
+    pattern_to_converter = {k: converter for converter in converters for k in converter.source_patterns}
+
+    # Build a mapping of what needs to be converted
+    # Sort keys to ensure consistent ordering (important for MoE conversions)
+    # Iterate over sorted keys and pop from state_dict to free memory immediately
+    conversion_mapping = {}
+    key_rename_cache = {}  # Cache rename results to avoid redundant processing
+    new_state_dict = {}  # Initialize here for direct key copies (non-converted keys)
+    sorted_keys = sorted(state_dict.keys(), key=lambda k: dot_natural_key(k))
+    for original_key in sorted_keys:
+        tensor = state_dict.pop(original_key)  # Pop to free memory immediately
+        # Rename the key according to all renaming patterns and optional weight converter patterns
+        renamed_key, source_pattern = rename_source_key(original_key, renamings, converters, prefix, model_state_dict)
+
+        # Cache the rename result for use in the cleanup loop
+        key_rename_cache[original_key] = renamed_key
+
+        # Only process if the renamed key is in the model's state dict
+        if renamed_key in model_state_dict:
+            # If source_pattern is not None, this key needs WeightConverter (e.g., MoE fusion)
+            if source_pattern is not None:
+                # Create a fresh converter for this layer to hold its tensors
+                # Share operations list (lightweight, no large data) but get new collected_tensors
+                converter = pattern_to_converter[source_pattern]
+                new_converter = WeightConverter(
+                    source_patterns=converter.source_patterns,
+                    target_patterns=converter.target_patterns,
+                    operations=converter.operations,
+                )
+                mapping = conversion_mapping.setdefault(renamed_key, new_converter)
+                mapping.add_tensor(renamed_key, original_key, source_pattern, tensor)
+            else:
+                # No conversion needed - add tensor directly to new_state_dict
+                # (this handles keys like embed_tokens, lm_head, layernorm, attention)
+                new_state_dict[renamed_key] = tensor
+
+    # Apply the conversions and build the new state dict
+    for renamed_key, mapping in conversion_mapping.items():
+        try:
+            # Only WeightConverter needs convert(); WeightRenaming is just a simple rename
+            if not isinstance(mapping, WeightConverter):
+                continue
+            realized_value, _ = mapping.convert(
+                renamed_key,
+                model=model,
+                config=model.config,
+            )
+            for target_name, param in realized_value.items():
+                param = param[0] if isinstance(param, list) else param
+                new_state_dict[target_name] = param
+            # Free memory by clearing source tensors
+            if hasattr(mapping, "source_tensors"):
+                mapping.source_tensors = {}
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to apply weight conversion for '{renamed_key}'. "
+                f"This likely means the checkpoint format is incompatible with the current model version. "
+                f"Error: {e}"
+            ) from e
+
+    # Add any keys that didn't need conversion (use cached rename results)
+    # At this point, state_dict only contains unconverted keys (others were popped)
+    for key in list(state_dict.keys()):
+        renamed_key = key_rename_cache.get(key)
+        if renamed_key is None:
+            # Key wasn't in our cache, compute rename
+            renamed_key, _ = rename_source_key(key, renamings, [], prefix, model_state_dict)
+        if renamed_key not in new_state_dict and renamed_key in model_state_dict:
+            new_state_dict[renamed_key] = state_dict.pop(key)
+
+    # Attach metadata to the new state dict
+    if metadata is not None:
+        new_state_dict._metadata = metadata
+
+    return new_state_dict
+
+
+def _load_state_dict_into_zero3_model(model_to_load, state_dict, load_config=None):
     """
     Loads state dict into a model specifically for Zero3, since DeepSpeed does not support the `transformers`
     tensor parallelism API.
 
     Nearly identical code to PyTorch's `_load_from_state_dict`
+
+    Args:
+        model_to_load: The model to load weights into
+        state_dict: The state dict containing the weights
+        load_config: Optional LoadStateDictConfig containing weight_mapping and other loading options
     """
     # copy state_dict so `_load_state_dict_into_zero3_model` can modify it
     metadata = getattr(state_dict, "_metadata", None)
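The fast path in the helper above reduces to a rename-and-filter pass over the checkpoint keys. A minimal standalone sketch of that behavior, assuming hypothetical regex renamings (the real handling lives in WeightRenaming and rename_source_key in core_model_loading and differs in its pattern machinery):

import re

# Hypothetical renaming table: source-key regex -> replacement string.
RENAMINGS = [
    (re.compile(r"^gamma$"), "weight"),
    (re.compile(r"\.LayerNorm\."), ".layer_norm."),
]

def rename_only(state_dict, model_keys):
    # Apply pure renamings, then keep only keys the model actually expects.
    out = {}
    for key, tensor in state_dict.items():
        for pattern, repl in RENAMINGS:
            key = pattern.sub(repl, key)
        if key in model_keys:
            out[key] = tensor
    return out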
@@ -303,6 +436,17 @@ def _load_state_dict_into_zero3_model(model_to_load, state_dict):
     if metadata is not None:
         state_dict._metadata = metadata
 
+    # Extract weight_mapping from load_config if provided
+    weight_mapping = None
+    if load_config is not None:
+        weight_mapping = getattr(load_config, "weight_mapping", None)
+
+    # Apply weight conversions if provided
+    if weight_mapping is not None and len(weight_mapping) > 0:
+        state_dict = _apply_weight_conversions_to_state_dict(model_to_load, state_dict, weight_mapping)
+        # Keep the current weight conversion mapping for later saving (in case it was coming directly from the user)
+        model_to_load._weight_conversions = weight_mapping
+
     error_msgs = []
     meta_model_state_dict = model_to_load.state_dict()
     missing_keys = set(meta_model_state_dict.keys())
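How a caller might thread a mapping through the new load_config parameter, sketched with a stand-in dataclass (the real LoadStateDictConfig referenced in the docstring carries more loading options; only the weight_mapping attribute read via getattr matters here):

from dataclasses import dataclass, field

@dataclass
class LoadConfigSketch:  # hypothetical stand-in for LoadStateDictConfig
    weight_mapping: list = field(default_factory=list)

cfg = LoadConfigSketch(weight_mapping=[])  # e.g. WeightRenaming/WeightConverter entries
weight_mapping = getattr(cfg, "weight_mapping", None)
# An empty mapping skips the conversion branch above entirely.
print(weight_mapping is not None and len(weight_mapping) > 0)  # False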
@@ -405,8 +549,6 @@ def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps
             return lr_scheduler
 
         lr_scheduler = DummyScheduler(optimizer, lr_scheduler_callable=_lr_scheduler_callable)
-    else:
-        lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
 
     return optimizer, lr_scheduler
 
@@ -25,7 +25,6 @@ from ..generation.configuration_utils import GenerationConfig
 from ..modeling_utils import PreTrainedModel
 from ..pytorch_utils import (
     is_torch_greater_or_equal,
-    is_torch_greater_or_equal_than_2_3,
     is_torch_greater_or_equal_than_2_6,
 )
 
@@ -751,8 +750,6 @@ def convert_and_export_with_cache(
     Returns:
         Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
     """
-    if not is_torch_greater_or_equal_than_2_3:
-        raise ImportError("torch >= 2.3 is required.")
 
     import torch.export._trace
 
@@ -879,6 +876,7 @@ class Seq2SeqLMExportableModule(torch.nn.Module):
                 "batch_size": batch_size,
                 "max_cache_len": max_cache_length,
             },
+            eos_token_id=model.generation_config.eos_token_id,
         )
         self.exported_encoder = None
         self.exported_decoder = None
@@ -994,7 +992,7 @@ class Seq2SeqLMExportableModule(torch.nn.Module):
             decoder_input_ids = torch.tensor([[next_token]], dtype=torch.long, device=model_device)
 
             # Check if EOS token
-            if next_token == self.
+            if next_token == self.generation_config.eos_token_id:
                 break
 
         return generated_ids
@@ -1016,8 +1014,6 @@ def export_with_dynamic_cache(
     Returns:
         Exported program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
     """
-    if not is_torch_greater_or_equal_than_2_3:
-        raise ImportError("torch >= 2.3 is required.")
 
     register_dynamic_cache_export_support()
 
@@ -14,7 +14,7 @@
 
 from ..core_model_loading import ConversionOps
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_torch_accelerator_available, is_torch_available, logging
+from ..utils import is_kernels_available, is_torch_accelerator_available, is_torch_available, logging
 
 
 if is_torch_available():
@@ -26,6 +26,64 @@ if is_torch_available():
 
 
 logger = logging.get_logger(__name__)
+
+# Global for the CUTLASS quantization kernel (lazily loaded)
+_quantization_kernel = None
+
+
+def _get_quantization_kernel():
+    """Lazily load the CUTLASS quantization kernel from HuggingFace Hub."""
+    global _quantization_kernel
+    if _quantization_kernel is None:
+        try:
+            from .hub_kernels import get_kernel
+
+            _quantization_kernel = get_kernel("RedHatAI/quantization")
+        except Exception as e:
+            logger.warning_once(f"Failed to load CUTLASS quantization kernel: {e}. Falling back to Triton.")
+            _quantization_kernel = False  # Mark as unavailable
+    return _quantization_kernel if _quantization_kernel else None
+
+
+def _supports_cutlass(block_size: list[int] | None, output_dtype: torch.dtype) -> bool:
+    """
+    Check if CUTLASS blockwise FP8 matmul is supported for the given block size and output dtype.
+
+    CUTLASS blockwise kernels require:
+    - SM90+ (Hopper or newer)
+    - Block size [128, 128] for weights
+    - Block size [1, 128] for activations (handled implicitly)
+    - Output dtype bfloat16 or float16
+    """
+
+    if not is_torch_available() or not torch.cuda.is_available() or not is_kernels_available():
+        return False
+
+    # CUTLASS only supports bfloat16/float16 output
+    if output_dtype not in (torch.bfloat16, torch.float16):
+        return False
+
+    # Check block size compatibility - CUTLASS only supports [128, 128]
+    if block_size is None:
+        return False
+    if len(block_size) != 2 or block_size[0] != 128 or block_size[1] != 128:
+        return False
+
+    # Check GPU capability (SM90+)
+    capability = torch.cuda.get_device_capability()
+    cuda_capability = capability[0] * 10 + capability[1]
+
+    # Try to load the kernel and check if blockwise FP8 is supported
+    kernel = _get_quantization_kernel()
+    if kernel is None:
+        return False
+
+    try:
+        return kernel.cutlass_scaled_mm_supports_block_fp8(cuda_capability)
+    except Exception:
+        return False
+
+
 try:
     _FP8_DTYPE = torch.float8_e4m3fn
     _FP8_MIN = torch.finfo(_FP8_DTYPE).min
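The SM gate in _supports_cutlass folds the (major, minor) capability tuple into a single integer; the arithmetic can be checked on any CUDA machine with plain PyTorch, no kernel download involved:

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    cuda_capability = major * 10 + minor
    # Hopper (H100) reports (9, 0) -> 90, so the SM90+ gate is cuda_capability >= 90.
    print(f"sm{cuda_capability}:", "meets SM90+" if cuda_capability >= 90 else "below SM90")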
@@ -338,6 +396,81 @@ def w8a8_block_fp8_matmul_triton(
     return C
 
 
+def w8a8_block_fp8_matmul(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    As: torch.Tensor,
+    Bs: torch.Tensor,
+    block_size: list[int],
+    output_dtype: torch.dtype = torch.float32,
+) -> torch.Tensor:
+    """
+    Dispatch to CUTLASS or Triton for block-wise FP8 matmul.
+
+    Uses CUTLASS when:
+    - Block size is [128, 128] (the only size CUTLASS supports)
+    - Running on SM90+ (Hopper or newer)
+    - The CUTLASS kernel is available
+    - Output dtype is bfloat16 or float16 (CUTLASS requirement)
+    - Tensor dimensions are compatible (divisible by 16)
+
+    Otherwise falls back to Triton.
+    """
+
+    if _supports_cutlass(block_size, output_dtype):
+        kernel = _get_quantization_kernel()
+        if kernel is not None:
+            try:
+                # CUTLASS expects:
+                # - A: [M, K] row-major, float8_e4m3fn
+                # - B: [K, N] column-major, float8_e4m3fn
+                # - As: [M, K//128] M-major (activation scales)
+                # - Bs: [K//128, N//128] K-major (weight scales)
+
+                # Reshape A to 2D if needed
+                original_shape = A.shape
+                M = A.numel() // A.shape[-1]
+                K = A.shape[-1]
+                N = B.shape[0]
+
+                # CUTLASS requires dimensions divisible by 16
+                if K % 16 != 0 or N % 16 != 0:
+                    raise ValueError(f"CUTLASS requires K ({K}) and N ({N}) divisible by 16")
+
+                A_2d = A.view(M, K).contiguous()
+                # B needs to be column-major for CUTLASS: [K, N] with stride(0)==1
+                # Our B is [N, K] row-major. Make it contiguous first, then transpose.
+                # B.contiguous() gives [N, K] with stride=(K,1)
+                # B.contiguous().t() gives [K, N] with stride=(1,K) which is column-major
+                # Do NOT call .contiguous() after .t() as it would make it row-major!
+                B_col_major = B.contiguous().t()
+
+                # Scales need proper layout for CUTLASS blockwise:
+                # As should be [M, K//128] with M-major layout (stride(0)==1)
+                # Bs should be [K//128, N//128] with K-major layout (stride(0)==1)
+
+                # As: reshape to [M, K//128], then make M-major via t().contiguous().t()
+                As_2d = As.view(M, -1).contiguous()
+                As_2d = As_2d.t().contiguous().t()  # [M, K//128] with stride(0)==1
+
+                # Bs: our input is [N//128, K//128], need [K//128, N//128] with stride(0)==1
+                # Transpose to get [K//128, N//128], then make K-major via t().contiguous().t()
+                Bs_km = Bs.contiguous().t()  # [K//128, N//128]
+                Bs_km = Bs_km.t().contiguous().t()  # Make K-major (stride(0)==1)
+
+                # Call CUTLASS kernel - it returns the output tensor
+                # Signature: cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias=None) -> Tensor
+                C = kernel.cutlass_scaled_mm(A_2d, B_col_major, As_2d, Bs_km, output_dtype, None)
+                # Reshape output back
+                C_shape = original_shape[:-1] + (N,)
+                return C.view(C_shape)
+            except Exception as e:
+                logger.warning_once(f"CUTLASS kernel failed: {e}. Falling back to Triton.")
+
+    # Fall back to Triton
+    return w8a8_block_fp8_matmul_triton(A, B, As, Bs, block_size, output_dtype)
+
+
 # Python version of the above triton function, it's much slower than the triton version, for testing
 @torch.compile
 def w8a8_block_fp8_matmul_compile(
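The layout comments in the dispatcher are straightforward to verify with a stride experiment: transposing a contiguous row-major tensor yields the column-major strides CUTLASS expects, and a trailing .contiguous() would undo exactly that (ordinary float tensors suffice, since strides are dtype-independent):

import torch

B = torch.randn(256, 128)          # [N, K] row-major, stride (128, 1)
B_col_major = B.contiguous().t()   # [K, N] with stride (1, 128) -> column-major
print(B_col_major.shape, B_col_major.stride())  # torch.Size([128, 256]) (1, 128)

# Calling .contiguous() after .t() rewrites the data row-major again:
print(B_col_major.contiguous().stride())        # (256, 1)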
@@ -463,7 +596,7 @@ class FP8Linear(nn.Linear):
         else:
             raise NotImplementedError("Not supported")
 
-        output =
+        output = w8a8_block_fp8_matmul(
             qinput,
             weight,
             scale,
@@ -478,7 +611,6 @@ class FP8Linear(nn.Linear):
         if self.bias is not None:
             output = output + self.bias
 
-        # output = torch.nan_to_num(output, nan=0.0)
         return output.to(dtype=input.dtype)
 
 
@@ -493,9 +625,12 @@ class FP8Expert(nn.Module):
         from ..activations import ACT2FN
 
         self.block_size = block_size
-
+        # TODO we don't need exact expert count here but only in forward
+        self.num_experts = config.num_local_experts if hasattr(config, "num_local_experts") else config.num_experts
         self.hidden_dim = config.hidden_size
-        self.intermediate_dim =
+        self.intermediate_dim = (
+            config.moe_intermediate_size if hasattr(config, "moe_intermediate_size") else config.intermediate_size
+        )
 
         Wg_out, Wg_in = 2 * self.intermediate_dim, self.hidden_dim
         Wd_out, Wd_in = self.hidden_dim, self.intermediate_dim
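The hasattr fallbacks above exist because MoE configs name these fields differently across architectures (num_local_experts vs. num_experts, moe_intermediate_size vs. intermediate_size); the same lookup in isolation, against a hypothetical config object:

class ConfigSketch:  # hypothetical; real configs are PretrainedConfig subclasses
    num_local_experts = 8
    hidden_size = 1024
    moe_intermediate_size = 2048

config = ConfigSketch()
num_experts = config.num_local_experts if hasattr(config, "num_local_experts") else config.num_experts
intermediate_dim = (
    config.moe_intermediate_size if hasattr(config, "moe_intermediate_size") else config.intermediate_size
)
assert (num_experts, intermediate_dim) == (8, 2048)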
@@ -544,7 +679,7 @@ class FP8Expert(nn.Module):
 
         for expert_idx in expert_hit:
             expert_idx = expert_idx[0]
-            if expert_idx == self.
+            if expert_idx == len(self.gate_up_proj):  # weights will load fine
                 continue
             top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
             current_state = hidden_states[token_idx]
@@ -571,7 +706,7 @@ class FP8Expert(nn.Module):
         torch_accelerator_module = getattr(torch, device_type, torch.cuda)
         with torch_accelerator_module.device(input.device):
             qinput, scale = act_quant(input, self.block_size[1])
-            output =
+            output = w8a8_block_fp8_matmul(
                 qinput,
                 weight,
                 scale,
@@ -13,12 +13,7 @@ def get_target_dtype(query: torch.Tensor, module: torch.nn.Module) -> torch.dtype
     """If the query is in float32, return a target dtype compatible with flash attention. Return None otherwise."""
     if query.dtype == torch.float32:
         if torch.is_autocast_enabled():
-
-            return (
-                torch.get_autocast_dtype("cuda")
-                if hasattr(torch, "get_autocast_dtype")
-                else torch.get_autocast_gpu_dtype()
-            )
+            return torch.get_autocast_dtype("cuda")
         # Handle the case where the model is quantized
         elif hasattr(module.config, "_is_quantized"):
             return module.config.dtype
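The dropped branch above was a compatibility shim for PyTorch versions that predate torch.get_autocast_dtype; on the releases transformers 5.x targets, the one-liner suffices. A quick check of what that call returns under autocast (on CPU-only builds the context manager disables itself with a warning, so the print is simply skipped):

import torch

with torch.autocast("cuda", dtype=torch.bfloat16):
    if torch.is_autocast_enabled():
        print(torch.get_autocast_dtype("cuda"))  # torch.bfloat16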
@@ -42,7 +37,7 @@ def flash_attention_forward(
 ) -> tuple[torch.Tensor, None]:
     if kwargs.get("output_attentions", False):
         logger.warning_once(
-            "
+            "Flash Attention does not support `output_attentions=True`."
             " Please set your attention to `eager` if you want any of these features."
         )
 
@@ -152,13 +152,19 @@ try:
                 layer_name="MegaBlocksMoeMLP",
             )
         },
+        "xpu": {
+            Mode.INFERENCE: LayerRepository(
+                repo_id="kernels-community/megablocks",
+                layer_name="MegaBlocksMoeMLP",
+            )
+        },
     },
     "FastGELU": {
         "cuda": {
             Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                 repo_id="kernels-community/activation",
                 layer_name="FastGELU",
-                version=
+                version=1,
             )
         }
     },
@@ -167,7 +173,7 @@ try:
             Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                 repo_id="kernels-community/activation",
                 layer_name="QuickGELU",
-                version=
+                version=1,
             )
         }
     },
@@ -176,28 +182,28 @@ try:
             Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
                 repo_id="kernels-community/activation",
                 layer_name="NewGELU",
-                version=
+                version=1,
             )
         }
     },
     "SiLU": {
         "cuda": {
             Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
-                repo_id="kernels-community/activation", layer_name="Silu", version=
+                repo_id="kernels-community/activation", layer_name="Silu", version=1
             )
         }
     },
     "GeLU": {
         "cuda": {
             Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
-                repo_id="kernels-community/activation", layer_name="Gelu", version=
+                repo_id="kernels-community/activation", layer_name="Gelu", version=1
             )
         }
     },
     "GeluTanh": {
         "cuda": {
             Mode.INFERENCE | Mode.TORCH_COMPILE: LayerRepository(
-                repo_id="kernels-community/activation", layer_name="GeluTanh", version=
+                repo_id="kernels-community/activation", layer_name="GeluTanh", version=1
             )
         }
     },
@@ -210,7 +216,12 @@ try:
             Mode.INFERENCE: FuncRepository(
                 repo_id="kernels-community/rotary", func_name="apply_rotary_transformers"
             )
-        }
+        },
+        "cuda": {
+            Mode.INFERENCE: FuncRepository(
+                repo_id="kernels-community/rotary", func_name="apply_rotary_transformers"
+            )
+        },
     }
 
 def has_key(d, key):
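The registry these hunks edit is a nested layer -> device -> mode mapping, so kernel resolution is chained dict lookups. A shape-only sketch with plain strings standing in for the Mode flags and LayerRepository/FuncRepository objects:

# Strings stand in for Mode flags and repository objects from the kernels integration.
KERNEL_MAPPING = {
    "MegaBlocksMoeMLP": {
        "cuda": {"inference": "kernels-community/megablocks"},
        "xpu": {"inference": "kernels-community/megablocks"},
    },
}

def resolve(layer_name, device, mode):
    # Return the registered repo for (layer, device, mode), or None if unmapped.
    return KERNEL_MAPPING.get(layer_name, {}).get(device, {}).get(mode)

assert resolve("MegaBlocksMoeMLP", "xpu", "inference") == "kernels-community/megablocks"
assert resolve("MegaBlocksMoeMLP", "rocm", "inference") is None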