PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671)

transformers/integrations/accelerate.py CHANGED Viewed

@@ -21,7 +21,6 @@ import inspect
 import os
 import re
 from collections import OrderedDict, defaultdict
-from contextlib import contextmanager
 from typing import TYPE_CHECKING
 from safetensors import safe_open
@@ -55,114 +54,6 @@ if TYPE_CHECKING:
 logger = logging.get_logger(__name__)
-@contextmanager
-def init_empty_weights(include_buffers: bool = False):
-    """
-    A context manager under which models are initialized with all parameters on the meta device, therefore creating an
-    empty model. Useful when just initializing the model would blow the available RAM.
-    Args:
-        include_buffers (`bool`, *optional*):
-            Whether or not to also put all buffers on the meta device while initializing.
-    Example:
-    ```python
-    import torch.nn as nn
-    from accelerate import init_empty_weights
-    # Initialize a model with 100 billions parameters in no time and without using any RAM.
-    with init_empty_weights():
-        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
-    ```
-    <Tip warning={true}>
-    Any model created under this context manager has no weights. As such you can't do something like
-    `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
-    Make sure to overwrite the default device_map param for [`load_checkpoint_and_dispatch`], otherwise dispatch is not
-    called.
-    </Tip>
-    """
-    with init_on_device(torch.device("meta"), include_buffers=include_buffers) as f:
-        yield f
-@contextmanager
-def init_on_device(device: "torch.device", include_buffers: bool = False):
-    """
-    A context manager under which models are initialized with all parameters on the specified device.
-    Args:
-        device (`torch.device`):
-            Device to initialize all parameters on.
-        include_buffers (`bool`, *optional*):
-            Whether or not to also put all buffers on the meta device while initializing.
-    Example:
-    ```python
-    import torch.nn as nn
-    from accelerate import init_on_device
-    with init_on_device(device=torch.device("cuda")):
-        tst = nn.Linear(100, 100)  # on `cuda` device
-    ```
-    """
-    if include_buffers:
-        with device:
-            yield
-        return
-    old_register_parameter = nn.Module.register_parameter
-    if include_buffers:
-        old_register_buffer = nn.Module.register_buffer
-    def register_empty_parameter(module, name, param):
-        old_register_parameter(module, name, param)
-        if param is not None:
-            param_cls = type(module._parameters[name])
-            kwargs = module._parameters[name].__dict__
-            kwargs["requires_grad"] = param.requires_grad
-            module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
-    def register_empty_buffer(module, name, buffer, persistent=True):
-        old_register_buffer(module, name, buffer, persistent=persistent)
-        if buffer is not None:
-            module._buffers[name] = module._buffers[name].to(device)
-    # Patch tensor creation
-    if include_buffers:
-        tensor_constructors_to_patch = {
-            torch_function_name: getattr(torch, torch_function_name)
-            for torch_function_name in ["empty", "zeros", "ones", "full"]
-        }
-    else:
-        tensor_constructors_to_patch = {}
-    def patch_tensor_constructor(fn):
-        def wrapper(*args, **kwargs):
-            kwargs["device"] = device
-            return fn(*args, **kwargs)
-        return wrapper
-    try:
-        nn.Module.register_parameter = register_empty_parameter
-        if include_buffers:
-            nn.Module.register_buffer = register_empty_buffer
-        for torch_function_name in tensor_constructors_to_patch:
-            setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
-        yield
-    finally:
-        nn.Module.register_parameter = old_register_parameter
-        if include_buffers:
-            nn.Module.register_buffer = old_register_buffer
-        for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
-            setattr(torch, torch_function_name, old_torch_function)
 def check_and_set_device_map(device_map: "torch.device | int | str | dict | None") -> dict | str | None:
     from ..modeling_utils import get_torch_context_manager_or_global_device
@@ -182,6 +73,10 @@ def check_and_set_device_map(device_map: "torch.device | int | str | dict | None
         device_map = {"": device_map}
     elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]:
         try:
+            if device_map == "cuda":
+                # setting to the local rank
+                local_rank = int(os.environ.get("LOCAL_RANK", 0))
+                device_map = f"cuda:{local_rank}"
             device_map = {"": torch.device(device_map)}
         except RuntimeError:
             raise ValueError(
@@ -398,7 +293,7 @@ def _get_device_map(
         # especially if the model uses WeightConverter (because there will be some uncontrollable cpu memory spikes during
         # the conversions before we resave the weights). In those cases, it's better to offload to disk a bit more
         # if we were in-between, as otherwise we blow-up cpu memory
-        if max_memory is None:
+        if max_memory is None and "cpu" in inferred_max_memory:
             inferred_max_memory["cpu"] *= 0.90
         if hf_quantizer is not None:
@@ -458,10 +353,13 @@ def accelerate_dispatch(model, hf_quantizer, device_map, offload_folder, offload
         dispatch_model(model, **device_map_kwargs)
-def expand_device_map(device_map, param_names):
+def expand_device_map(device_map: dict | None, param_names: list[str]):
     """
     Expand a device map to return the correspondence parameter name to device.
     """
+    if device_map is None:
+        return dict.fromkeys(param_names, "cpu")
     # Here, we first sort by number of submodules, then length of the full string, to make sure to match correctly
     device_map_regex = re.compile(
         "|".join(rf"({k})" for k in sorted(device_map.keys(), key=lambda x: (x.count("."), len(x)), reverse=True))
@@ -474,6 +372,15 @@ def expand_device_map(device_map, param_names):
     return new_device_map
+def get_device(device_map: dict | None, param_name: str, valid_torch_device: bool = False) -> torch.device | str | int:
+    """Return the device on which `param_name` should be according to the `device_map`. If `valid_torch_device` is `True`,
+    then if the device is `"disk"`, `"cpu"` will be returned instead."""
+    device = expand_device_map(device_map, [param_name])[param_name]
+    if valid_torch_device and device == "disk":
+        return "cpu"
+    return device
 def accelerate_disk_offload(
     model: "PreTrainedModel",
     disk_offload_folder: str | None,
@@ -554,6 +461,32 @@ def offload_weight(weight: torch.Tensor, weight_name: str, offload_folder: str |
     return offload_index
+def load_offloaded_parameter(model: "PreTrainedModel", param_name: str) -> torch.Tensor:
+    """Load `param_name` from disk, if it was offloaded due to the device_map, and thus lives as a meta parameter
+    inside `model`.
+    This is needed when resaving a model, when some parameters were offloaded (we need to load them from disk, to
+    then resave them to disk in the correct shard...)."""
+    # Start from the most inner module, and try to find the hook that was used for offloading the param
+    module_parts = param_name.split(".")
+    modules_to_check = [".".join(module_parts[:-idx]) for idx in range(1, len(module_parts))] + [""]
+    for parent_name in modules_to_check:
+        parent = model.get_submodule(parent_name)
+        if hasattr(parent, "_hf_hook"):
+            weights_map = parent._hf_hook.weights_map
+            truncated_param_name = param_name.replace(f"{parent_name}." if parent_name != "" else parent_name, "")
+            break
+    # If we did not break the loop, something is wrong
+    else:
+        raise ValueError(
+            f"{param_name} is on the meta device because it was offloaded, but we could not find "
+            "the corresponding hook for it"
+        )
+    # This call loads it from disk
+    tensor = weights_map[truncated_param_name]
+    return tensor
 def _init_infer_auto_device_map(
     model: nn.Module,
     max_memory: dict[int | str, int | str] | None = None,

transformers/integrations/aqlm.py CHANGED Viewed

@@ -14,13 +14,11 @@
 "AQLM (Additive Quantization of Language Model) integration file"
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_accelerate_available, is_torch_available, logging
+from ..utils import is_torch_available, logging
-if is_accelerate_available():
-    from accelerate import init_empty_weights
 if is_torch_available():
+    import torch
     import torch.nn as nn
 logger = logging.get_logger(__name__)
@@ -46,7 +44,7 @@ def replace_with_aqlm_linear(model, modules_to_not_convert: list[str] | None = N
     for module_name, module in model.named_modules():
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
-        with init_empty_weights():
+        with torch.device("meta"):
             if isinstance(module, nn.Linear):
                 new_module = QuantizedLinear(
                     module.in_features,

transformers/integrations/awq.py CHANGED Viewed

@@ -16,12 +16,9 @@
 from typing import Optional, Union
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_accelerate_available, is_torch_available, logging
+from ..utils import is_torch_available, logging
-if is_accelerate_available():
-    from accelerate import init_empty_weights
 if is_torch_available():
     import torch
     import torch.nn as nn
@@ -97,7 +94,7 @@ def replace_with_awq_linear(
     for module_name, module in model.named_modules():
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
-        with init_empty_weights():
+        with torch.device("meta"):
             if isinstance(module, nn.Linear):
                 new_module = target_cls(
                     bits=quantization_config.bits,

transformers/integrations/bitnet.py CHANGED Viewed

@@ -1,10 +1,7 @@
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_accelerate_available, is_torch_available, logging
+from ..utils import is_torch_available, logging
-if is_accelerate_available():
-    from accelerate import init_empty_weights
 if is_torch_available():
     import torch
     import torch.nn as nn
@@ -92,7 +89,7 @@ def unpack_weights(packed: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
     Explanation of the example:
     ---------------------------
-    Let's take the first value for example 0b10100001, we we will only focus on the first column,
+    Let's take the first value for example 0b10100001, we will only focus on the first column,
     because every element is unpacked across the first dimension
     - First 2 bits: `01` → 0 at [0][0]
     - Second 2 bits: `00` → -1 at [0][2]
@@ -173,7 +170,7 @@ class BitLinear(nn.Module):
         Activation function : Performs symmetric, per-token quantization on the input activations.
         Parameters:
         -----------
-        x : torch.Tensor
+        input : torch.Tensor
             Input activations to be quantized.
         num_bits : int, optional (default=8)
             Number of bits to use for quantization, determining the quantization range.
@@ -334,7 +331,7 @@ def replace_with_bitnet_linear(model, modules_to_not_convert: list[str] | None =
     for module_name, module in model.named_modules():
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
-        with init_empty_weights():
+        with torch.device("meta"):
             if isinstance(module, nn.Linear):
                 if quantization_config and quantization_config.linear_class == "autobitlinear":
                     new_module = AutoBitLinear(
@@ -365,7 +362,7 @@ def replace_with_bitnet_linear(model, modules_to_not_convert: list[str] | None =
     if not has_been_replaced:
         logger.warning(
-            "You are loading your model using eetq but no linear modules were found in your model."
+            "You are loading your model using bitnet but no linear modules were found in your model."
             " Please double check your model architecture, or submit an issue on github if you think this is"
             " a bug."
         )

transformers/integrations/bitsandbytes.py CHANGED Viewed

@@ -22,7 +22,6 @@ if is_torch_available():
 if is_accelerate_available():
     import accelerate
-    from accelerate import init_empty_weights
     from accelerate.hooks import add_hook_to_module, remove_hook_from_module
 logger = logging.get_logger(__name__)
@@ -181,7 +180,7 @@ def replace_with_bnb_linear(
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
         new_module = None
-        with init_empty_weights():
+        with torch.device("meta"):
             if isinstance(module, (nn.Linear, Conv1D)):
                 if isinstance(module, Conv1D):
                     in_features, out_features = module.weight.shape
@@ -233,7 +232,7 @@ def replace_with_bnb_linear(
 # Copied from PEFT: https://github.com/huggingface/peft/blob/47b3712898539569c02ec5b3ed4a6c36811331a1/src/peft/utils/integrations.py#L41
-def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", state=None):
+def dequantize_bnb_weight(weight: "torch.nn.Parameter", state=None):
     """
     Helper function to dequantize 4bit or 8bit bnb weights.
@@ -248,10 +247,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", st
     if cls_name == "Params4bit":
         output_tensor = bnb.functional.dequantize_4bit(weight.data, weight.quant_state)
-        logger.warning_once(
-            f"The model is going to be dequantized in {output_tensor.dtype} - if you want to upcast it to another dtype, make sure to pass the desired dtype when quantizing the model through `bnb_4bit_quant_type` argument of `BitsAndBytesConfig`"
-        )
-        return output_tensor.to(dtype)
+        return output_tensor
     if state.SCB is None:
         state.SCB = weight.SCB
@@ -263,7 +259,7 @@ def dequantize_bnb_weight(weight: "torch.nn.Parameter", dtype: "torch.dtype", st
         # Multiply by (scale/127) to dequantize.
         dequantized = weight.data * state.SCB.view(-1, 1) * 7.874015718698502e-3
-    return dequantized.to(dtype)
+    return dequantized
 def _create_accelerate_new_hook(old_hook):
@@ -283,10 +279,7 @@ def _create_accelerate_new_hook(old_hook):
     return new_hook
-def dequantize_and_replace(
-    model,
-    quantization_config=None,
-):
+def dequantize_and_replace(model, quantization_config=None, dtype=None):
     """
     Converts a quantized model into its dequantized original version. The newly converted model will have
     some performance drop compared to the original model before quantization - use it only for specific usecases
@@ -297,14 +290,22 @@ def dequantize_and_replace(
     quant_method = quantization_config.quantization_method()
     target_cls = bnb.nn.Linear8bitLt if quant_method == "llm_int8" else bnb.nn.Linear4bit
     for module_name, module in model.named_modules():
         if isinstance(module, target_cls):
-            with init_empty_weights():
+            with torch.device("meta"):
                 bias = getattr(module, "bias", None)
                 new_module = torch.nn.Linear(module.in_features, module.out_features, bias=bias is not None)
             state = module.state if quant_method == "llm_int8" else None
-            new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, model.dtype, state))
+            new_module.weight = torch.nn.Parameter(dequantize_bnb_weight(module.weight, state))
+            weight = dequantize_bnb_weight(module.weight, state)
+            if dtype is None:
+                logger.warning_once(
+                    f"The modules are dequantized in {weight.dtype}. If you want to change the dtype, please specify `dtype` in `dequantize`. "
+                )
+            else:
+                logger.warning_once(f"The modules are dequantized in {weight.dtype} and casted to {dtype}.")
+                weight = weight.to(dtype)
+            new_module.weight = torch.nn.Parameter(weight)
             if bias is not None:
                 new_module.bias = bias
             if hasattr(module, "_hf_hook"):

transformers/integrations/deepspeed.py CHANGED Viewed

@@ -304,6 +304,15 @@ def _load_state_dict_into_zero3_model(model_to_load, state_dict):
         state_dict._metadata = metadata
     error_msgs = []
+    meta_model_state_dict = model_to_load.state_dict()
+    missing_keys = set(meta_model_state_dict.keys())
+    prefix_model = getattr(model_to_load, "base_model_prefix", None)
+    # take care of the case where in the checkpoint we don't have the prefix
+    state_dict = {
+        (f"{prefix_model}.{k}" if meta_model_state_dict.get(f"{prefix_model}.{k}") is not None else k): v
+        for k, v in state_dict.items()
+    }
     # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
     # so we need to apply the function recursively.
@@ -320,7 +329,14 @@ def _load_state_dict_into_zero3_model(model_to_load, state_dict):
             # In sharded models, each shard has only part of the full state_dict, so only gather
             # parameters that are in the current state_dict.
             named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-            params_to_gather = [named_parameters[k] for k in named_parameters if k in state_dict]
+            params_to_gather = []
+            for k in named_parameters:
+                if k in state_dict:
+                    param = named_parameters[k]
+                    # crutial to not init the weight again
+                    param._is_hf_initialized = True
+                    params_to_gather.append(param)
+                    missing_keys.discard(k)
             if len(params_to_gather) > 0:
                 # because zero3 puts placeholders in model params, this context
@@ -333,11 +349,10 @@ def _load_state_dict_into_zero3_model(model_to_load, state_dict):
         for name, child in module._modules.items():
             if child is not None:
                 load(child, state_dict, prefix + name + ".", assign_to_params_buffers)
-                child._is_hf_initialized = True
     load(model_to_load, state_dict, assign_to_params_buffers=False)
-    return error_msgs
+    return error_msgs, missing_keys
 def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps, model_parameters):

transformers/integrations/eetq.py CHANGED Viewed

@@ -14,15 +14,13 @@
 # limitations under the License.
 from ..core_model_loading import ConversionOps
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_accelerate_available, is_torch_available, logging
+from ..utils import is_torch_available, logging
 if is_torch_available():
     import torch
     import torch.nn as nn
-if is_accelerate_available():
-    from accelerate import init_empty_weights
 logger = logging.get_logger(__name__)
@@ -97,7 +95,7 @@ def replace_with_eetq_linear(model, modules_to_not_convert: list[str] | None = N
             Names of the modules to not convert in `EetqLinear`. In practice we keep the `lm_head` in full precision
             for numerical stability reasons.
     """
-    from kernels import get_kernel
+    from .hub_kernels import get_kernel
     global eetq_kernels_hub
     eetq_kernels_hub = get_kernel("kernels-community/quantization-eetq")
@@ -108,7 +106,7 @@ def replace_with_eetq_linear(model, modules_to_not_convert: list[str] | None = N
     for module_name, module in model.named_modules():
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
-        with init_empty_weights():
+        with torch.device("meta"):
             if isinstance(module, nn.Linear):
                 new_module = EetqLinear(
                     module.in_features, module.out_features, bias=module.bias is not None, **module_kwargs

transformers/integrations/fbgemm_fp8.py CHANGED Viewed

@@ -257,7 +257,7 @@ class FbgemmFp8Llama4TextExperts(nn.Module):
 @lru_cache(maxsize=1)
 def get_quantize_fp8_per_row():
     if _is_torch_xpu_available:
-        from kernels import get_kernel
+        from .hub_kernels import get_kernel
         return get_kernel("kernels-community/fp8-fbgemm").quantize_fp8_per_row
     return torch.ops.fbgemm.quantize_fp8_per_row

transformers/integrations/finegrained_fp8.py CHANGED Viewed

@@ -15,7 +15,7 @@
 from ..core_model_loading import ConversionOps
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_accelerate_available, is_torch_accelerator_available, is_torch_available, logging
+from ..utils import is_torch_accelerator_available, is_torch_available, logging
 if is_torch_available():
@@ -25,23 +25,16 @@ if is_torch_available():
     import triton.language as tl
     from torch.nn import functional as F
-if is_accelerate_available():
-    from accelerate import init_empty_weights
 logger = logging.get_logger(__name__)
 try:
     _FP8_DTYPE = torch.float8_e4m3fn
     _FP8_MIN = torch.finfo(_FP8_DTYPE).min
     _FP8_MAX = torch.finfo(_FP8_DTYPE).max
-    _FP8_IS_INT = False
 except AttributeError:
-    _FP8_DTYPE = torch.int8
-    _FP8_MIN, _FP8_MAX = -127, 127
-    _FP8_IS_INT = True
-    logger.warning_once(
-        "torch.float8_e4m3fn not available; falling back to int8 emulation for Fp8Quantize operations."
-    )
+    _FP8_DTYPE = None
+    _FP8_MIN, _FP8_MAX = -448, 448
+    logger.warning_once("torch.float8_e4m3fn not available")
 # Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
@@ -618,7 +611,7 @@ def replace_with_fp8_linear(
         # we need this to correctly materialize the weights during quantization
         module_kwargs = {} if pre_quantized else {"dtype": None}
         new_module = None
-        with init_empty_weights():
+        with torch.device("meta"):
             if module_name.endswith(".experts"):
                 new_module = FP8Expert(
                     config=model.config, block_size=quantization_config.weight_block_size, **module_kwargs
@@ -701,10 +694,7 @@ class Fp8Quantize(ConversionOps):
         scales_broadcast = scales.unsqueeze(-1).unsqueeze(-3)  # -> (..., rows_tiles, 1, cols_tiles, 1)
         scaled = reshaped * scales_broadcast
-        if _FP8_IS_INT:
-            quantized = torch.clamp(scaled.round(), min=_FP8_MIN, max=_FP8_MAX).to(_FP8_DTYPE)
-        else:
-            quantized = torch.clamp(scaled, min=_FP8_MIN, max=_FP8_MAX).to(_FP8_DTYPE)
+        quantized = torch.clamp(scaled, min=_FP8_MIN, max=_FP8_MAX).to(_FP8_DTYPE)
         quantized = quantized.reshape(original_shape)

transformers/integrations/flash_attention.py CHANGED Viewed

@@ -20,8 +20,8 @@ def get_target_dtype(query: torch.Tensor, module: torch.nn.Module) -> torch.dtyp
                 else torch.get_autocast_gpu_dtype()
             )
         # Handle the case where the model is quantized
-        elif hasattr(module.config, "_pre_quantization_dtype"):
-            return module.config._pre_quantization_dtype
+        elif hasattr(module.config, "quantization_config"):
+            return module.config.dtype
         else:
             return next(layer for layer in module.modules() if isinstance(layer, torch.nn.Linear)).weight.dtype
     return None

transformers/integrations/higgs.py CHANGED Viewed

@@ -16,12 +16,9 @@
 from math import sqrt
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_accelerate_available, is_flute_available, is_hadamard_available, is_torch_available, logging
+from ..utils import is_flute_available, is_hadamard_available, is_torch_available, logging
-if is_accelerate_available():
-    from accelerate import init_empty_weights
 if is_torch_available():
     import torch
     import torch.nn as nn
@@ -569,7 +566,7 @@ def replace_with_higgs_linear(model, modules_to_not_convert: list[str] | None =
     for module_name, module in model.named_modules():
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
-        with init_empty_weights():
+        with torch.device("meta"):
             if isinstance(module, nn.Linear):
                 new_module = HiggsLinear(
                     module.in_features,

transformers/integrations/hub_kernels.py CHANGED Viewed

@@ -11,11 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib.metadata
 import os
 import re
 from collections.abc import Callable
 from types import ModuleType
+from packaging import version as pkg_version
 from ..utils import ENV_VARS_TRUE_VALUES, logging
 from ..utils.import_utils import is_kernels_available
 from .flash_attention import flash_attention_forward
@@ -28,10 +31,12 @@ try:
         Device,
         LayerRepository,
         Mode,
-        get_kernel,
         register_kernel_mapping,
         replace_kernel_forward_from_hub,
     )
+    from kernels import (
+        get_kernel as get_kernel_hub,
+    )
     from kernels import (
         use_kernel_forward_from_hub as _kernels_use_kernel_forward_from_hub,
     )
@@ -340,8 +345,6 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
         mapping[kernel_name] = None
         return None
     if _kernels_available:
-        from kernels import get_kernel
         try:
             repo_id = _HUB_KERNEL_MAPPING[kernel_name]["repo_id"]
             revision = _HUB_KERNEL_MAPPING[kernel_name].get("revision", None)
@@ -370,7 +373,7 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
         if callable(is_kernel_available) and is_kernel_available():
             # Try to import the module "{kernel_name}" from parent package level
             try:
-                module = importlib.import_module(f"{kernel_name}")
+                module = importlib.import_module(f"{new_kernel_name}")
                 mapping[kernel_name] = module
                 return module
             except Exception:
@@ -381,6 +384,20 @@ def lazy_load_kernel(kernel_name: str, mapping: dict[str, ModuleType | None] = _
     return mapping[kernel_name]
+def get_kernel(kernel_name: str, revision: str | None = None, version: str | None = None) -> ModuleType:
+    from .. import __version__
+    user_agent = {"framework": "transformers", "version": __version__, "repo_id": kernel_name}
+    if _kernels_available:
+        kernels_version = importlib.metadata.version("kernels")
+        if pkg_version.parse(kernels_version) >= pkg_version.parse("0.10.4"):
+            return get_kernel_hub(kernel_name, revision=revision, version=version, user_agent=user_agent)
+        else:
+            return get_kernel_hub(kernel_name, revision=revision)
+    else:
+        raise ImportError("kernels is not installed, please install it with `pip install kernels`")
 def use_kernelized_func(module_names: list[Callable] | Callable):
     """
     This decorator attaches the target function as an attribute of the module.
@@ -415,5 +432,6 @@ __all__ = [
     "register_kernel_mapping_transformers",
     "replace_kernel_forward_from_hub",
     "lazy_load_kernel",
+    "get_kernel",
     "use_kernelized_func",
-]
+]  # type: ignore

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl