transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/modeling_gguf_pytorch_utils.py

@@ -63,6 +63,24 @@ class TensorProcessor:
     def __init__(self, config=None):
         self.config = config or {}
 
+    def preprocess_name(self, hf_name: str) -> str:
+        """
+        Preprocesses the tensor name to ease loading the GGUF tensors.
+        """
+        return hf_name
+
+    def perform_fallback_tensor_mapping(
+        self, gguf_to_hf_name_map: dict[str, str], suffix: str, qual_name: str, hf_name: str
+    ):
+        """
+        Called when get_gguf_hf_weights_map fails to map a HF parameter
+        (tensor) and corresponding GGUF one.
+
+        This is particularly useful to resolve one-to-many
+        HF-GGUF mappings sometimes appear in some MoE models.
+        """
+        pass
+
     def process(self, weights, name, **kwargs):
         return GGUFTensor(weights, name, {})
 
@@ -98,15 +116,31 @@ class LlamaTensorProcessor(TensorProcessor):
 
 
 class Qwen2MoeTensorProcessor(TensorProcessor):
+    HF_EXPERT_RENAME_PATTERN = re.compile(r"mlp.experts.\d+.")
+    HF_MOE_W13_PATTERN = re.compile(r"model\.layers\.(?P<bid>\d+)\.mlp\.experts\.gate_up_proj")
+    GGUF_MOE_WEIGHTS_PATTERN = re.compile(r"(?P<name>.*\.ffn_(?P<w>gate|down|up)_exps)\.weight$")
+
     def __init__(self, config=None):
         super().__init__(config=config)
 
-    def process(self, weights, name, **kwargs):
-        if "_exp" in name:
+    def preprocess_name(self, hf_name: str) -> str:
+        return re.sub(self.HF_EXPERT_RENAME_PATTERN, "mlp.experts.", hf_name)
+
+    def perform_fallback_tensor_mapping(
+        self, gguf_to_hf_name_map: dict[str, str], suffix: str, qual_name: str, hf_name: str
+    ):
+        # Map merged MoE weights (w1 (gate) and w3 (up)) separately.
+        if m := re.fullmatch(self.HF_MOE_W13_PATTERN, hf_name):
+            full_hf_name = qual_name + hf_name
+            gguf_to_hf_name_map[f"blk.{m['bid']}.ffn_gate_exps{suffix}"] = full_hf_name
+            gguf_to_hf_name_map[f"blk.{m['bid']}.ffn_up_exps{suffix}"] = full_hf_name
+
+    def process(self, weights, name: str, **kwargs):
+        if m := re.fullmatch(self.GGUF_MOE_WEIGHTS_PATTERN, name):
             tensor_key_mapping = kwargs.get("tensor_key_mapping")
             parsed_parameters = kwargs.get("parsed_parameters")
             if tensor_key_mapping:
-                self._split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping)
+                self._set_moe_expert_tensor(weights, parsed_parameters, tensor_key_mapping[m["name"]], m["w"])
             return GGUFTensor(weights, None, {})
         if "ffn_gate_inp_shexp" in name:
             # for compatibility tensor shared_expert_gate must be (1, 2048) dim,
@@ -114,17 +148,27 @@ class Qwen2MoeTensorProcessor(TensorProcessor):
             weights = np.expand_dims(weights, axis=0)
             return GGUFTensor(weights, name, {})
 
-    def _split_moe_expert_tensor(
-        self, weights: np.ndarray, parsed_parameters: dict[str, dict], name: str, tensor_key_mapping: dict
-    ):
-        # Original merge implementation
-        # https://github.com/99991/pygguf/blob/e633da26d8acc4f70f564ff6fbd691d1d4521b17/gguf.py#L79
-        name = tensor_key_mapping[name]
-        w_counter = self.config.get("num_experts", 60)
-        for i in range(0, w_counter):
-            temp_name = name.replace("mlp.experts.", f"mlp.experts.{i}.")
-            exp_weight = weights[i]
-            parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))
+    def _set_moe_expert_tensor(self, weights: np.ndarray, parsed_parameters: dict[str, dict], hf_name: str, w: str):
+        torch_weights = torch.from_numpy(np.copy(weights))
+        if w == "down":
+            parsed_parameters["tensors"][hf_name] = torch_weights
+        else:
+            # Double the size of the second dimension to interleave w1 (gate) and w3 (up)
+            # weights per expert (which is the first dimension).
+            # w1 (gate) comes first and w3 (up) comes second.
+            # ref: https://github.com/vllm-project/vllm/blob/8f8fda261a620234fdeea338f44093d5d8072879/vllm/model_executor/layers/fused_moe/layer.py#L988-L1015
+            shape = list(weights.shape)
+            shard_dim = 1
+            shard_size = shape[shard_dim]
+            shape[shard_dim] = shard_size * 2
+            if hf_name not in parsed_parameters["tensors"]:
+                parsed_parameters["tensors"][hf_name] = torch.zeros(shape, dtype=torch_weights.dtype)
+            out: torch.Tensor = parsed_parameters["tensors"][hf_name]
+            if w == "gate":
+                out = out.narrow(shard_dim, 0, shard_size)
+            else:  # w == "up"
+                out = out.narrow(shard_dim, shard_size, shard_size)
+            out.copy_(torch_weights)
 
 
 class BloomTensorProcessor(TensorProcessor):
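The narrow/copy_ pattern above assembles the merged gate_up_proj layout one GGUF tensor at a time: the gate block fills the first half of dim 1 and the up block the second half. A minimal stand-alone sketch of the same merge, with toy shapes invented for illustration (narrow returns a view, so copy_ writes into the parent tensor):

import torch

num_experts, intermediate_size, hidden_size = 2, 3, 4
gate = torch.randn(num_experts, intermediate_size, hidden_size)  # stands in for ffn_gate_exps
up = torch.randn(num_experts, intermediate_size, hidden_size)    # stands in for ffn_up_exps

# Allocate the destination with dim 1 doubled, as _set_moe_expert_tensor does,
# then write each half through a narrowed view.
merged = torch.zeros(num_experts, 2 * intermediate_size, hidden_size)
merged.narrow(1, 0, intermediate_size).copy_(gate)                # w1 (gate) first
merged.narrow(1, intermediate_size, intermediate_size).copy_(up)  # w3 (up) second

assert torch.equal(merged, torch.cat([gate, up], dim=1))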
@@ -281,6 +325,7 @@ def read_field(reader, field):
 # modified from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/loader.py#L1115-L1147
 def get_gguf_hf_weights_map(
     hf_model,
+    processor: TensorProcessor,
     model_type: Optional[str] = None,
     num_layers: Optional[int] = None,
     qual_name: str = "",
@@ -334,9 +379,7 @@ def get_gguf_hf_weights_map(
     gguf_to_hf_name_map = {}
     state_dict = hf_model.state_dict()
     for hf_name in state_dict:
-
-        if model_type in ("qwen2moe", "qwen3moe") and "mlp.experts." in hf_name:
-            hf_name = re.sub(r"mlp.experts.\d+.", "mlp.experts.", hf_name)
+        hf_name = processor.preprocess_name(hf_name)
 
         name, suffix = hf_name, ""
         if hf_name.endswith(".weight") or hf_name.endswith(".bias"):
@@ -345,6 +388,7 @@ def get_gguf_hf_weights_map(
 
         gguf_name = name_map.get_name(name)
         if gguf_name is None:
+            processor.perform_fallback_tensor_mapping(gguf_to_hf_name_map, suffix, qual_name, hf_name)
             continue
 
         gguf_to_hf_name_map[gguf_name + suffix] = qual_name + hf_name
@@ -353,7 +397,9 @@ def get_gguf_hf_weights_map(
     # Therefore, we need to check submodule as well to get a correct mapping
     if named_children := hf_model.named_children():
         for name, child in named_children:
-            sub_map = get_gguf_hf_weights_map(child, model_type, num_layers, qual_name=f"{qual_name}{name}.")
+            sub_map = get_gguf_hf_weights_map(
+                child, processor, model_type, num_layers, qual_name=f"{qual_name}{name}."
+            )
             # Ignore the keys that are already in the main map to avoid overwriting
             sub_map = {k: v for k, v in sub_map.items() if k not in gguf_to_hf_name_map}
             gguf_to_hf_name_map.update(sub_map)
@@ -507,12 +553,13 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
     if return_tensors:
         parsed_parameters["tensors"] = {}
 
-        tensor_key_mapping = get_gguf_hf_weights_map(model_to_load)
         config = parsed_parameters.get("config", {})
 
         ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor)
         processor = ProcessorClass(config=config)
 
+        tensor_key_mapping = get_gguf_hf_weights_map(model_to_load, processor)
+
         for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."):
            name = tensor.name
            weights = dequantize(tensor.data, tensor.tensor_type)
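Taken together, these hunks reverse the old wiring: the architecture-specific processor is instantiated first, and get_gguf_hf_weights_map then consults it both to normalize HF parameter names and to rescue names the GGUF name map cannot resolve. A condensed stand-alone paraphrase of that flow, not the transformers API itself (the toy processor and the two-entry name map below are invented):

class TensorProcessor:
    def preprocess_name(self, hf_name: str) -> str:
        return hf_name

    def perform_fallback_tensor_mapping(self, gguf_to_hf_name_map, suffix, qual_name, hf_name):
        pass


class ToyMoeProcessor(TensorProcessor):
    def preprocess_name(self, hf_name: str) -> str:
        # Collapse per-expert names so they hit a single GGUF entry.
        return hf_name.replace("experts.0.", "experts.")

    def perform_fallback_tensor_mapping(self, gguf_to_hf_name_map, suffix, qual_name, hf_name):
        if hf_name.endswith("gate_up_proj"):
            # One HF tensor fed by two GGUF tensors (the one-to-many case).
            gguf_to_hf_name_map["blk.0.ffn_gate_exps" + suffix] = qual_name + hf_name
            gguf_to_hf_name_map["blk.0.ffn_up_exps" + suffix] = qual_name + hf_name


def build_map(hf_names, name_map, processor):
    gguf_to_hf = {}
    for hf_name in hf_names:
        hf_name = processor.preprocess_name(hf_name)
        gguf_name = name_map.get(hf_name)
        if gguf_name is None:
            # New in rc2: give the processor a chance to map the name manually.
            processor.perform_fallback_tensor_mapping(gguf_to_hf, "", "", hf_name)
            continue
        gguf_to_hf[gguf_name] = hf_name
    return gguf_to_hf


print(build_map(["layers.0.experts.0.down_proj", "layers.0.experts.gate_up_proj"],
                {"layers.0.experts.down_proj": "blk.0.ffn_down_exps"},
                ToyMoeProcessor()))
# {'blk.0.ffn_down_exps': 'layers.0.experts.down_proj',
#  'blk.0.ffn_gate_exps': 'layers.0.experts.gate_up_proj',
#  'blk.0.ffn_up_exps': 'layers.0.experts.gate_up_proj'}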
transformers/modeling_rope_utils.py

@@ -46,17 +46,19 @@ def dynamic_rope_update(rope_forward):
     def longrope_frequency_update(self, position_ids, device, layer_type=None):
         """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+
         if layer_type is None:
             rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
+            original_max_position_embeddings = self.config.rope_parameters["original_max_position_embeddings"]
         else:
             rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"
+            original_max_position_embeddings = self.config.rope_parameters[layer_type][
+                "original_max_position_embeddings"
+            ]
 
         if seq_len > original_max_position_embeddings:
             if not hasattr(self, f"{layer_type}_long_inv_freq"):
@@ -223,7 +225,6 @@ def _compute_dynamic_ntk_parameters(
         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
         post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
     """
-    # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters
     # For backward compatibility standardize the `rope_parameters_dict` if it uses old format
     config.standardize_rope_params()
     rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters
@@ -232,30 +233,29 @@ def _compute_dynamic_ntk_parameters(
     partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0)
     head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
     dim = int(head_dim * partial_rotary_factor)
-    max_position_embeddings = config.max_position_embeddings
     factor = rope_parameters_dict["factor"]
     attention_factor = 1.0  # Unused in this type of RoPE
 
     # seq_len: default to max_position_embeddings, e.g. at init time
     if seq_len is None:
-        seq_len = max_position_embeddings
+        seq_len = config.max_position_embeddings
     elif isinstance(seq_len, torch.Tensor):
         seq_len = torch.maximum(
             seq_len,
-            torch.tensor(max_position_embeddings, dtype=seq_len.dtype, device=seq_len.device),
+            torch.tensor(config.max_position_embeddings, dtype=seq_len.dtype, device=seq_len.device),
         )
     else:
-        seq_len = max(seq_len, max_position_embeddings)
+        seq_len = max(seq_len, config.max_position_embeddings)
 
     # Compute the inverse frequencies
-    base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
+    base = base * ((factor * seq_len / config.max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
     inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
     return inv_freq, attention_factor
 
 
 def _compute_yarn_parameters(
     config: "PreTrainedConfig",
-    device: "torch.device",
+    device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
     layer_type: Optional[str] = None,
 ) -> tuple["torch.Tensor", float]:
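For orientation, the line changed near the end of this hunk is the dynamic NTK update itself: base is rescaled to base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) once the sequence outgrows the pretraining window. A quick worked example with invented numbers:

base, factor, dim = 10000.0, 2.0, 128
max_position_embeddings, seq_len = 4096, 8192  # sequence is 2x the pretraining window

scaled_base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
print(scaled_base)  # ~30528: roughly a 3x larger base, which stretches all RoPE wavelengths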
@@ -292,8 +292,7 @@ def _compute_yarn_parameters(
             `mscale_all_dim` are provided, `mscale_all_dim` acts scalar augmenting `log(factor)` when computing
             the denominator for the inferred value of `attention_factor`. If not provided, `attention_factor`
             will be calculated based on `factor` only.
-        * `original_max_position_embeddings` (`int`, *optional*): The original max position embeddings used
-            during pretraining. If not provided, the function falls back to `max_position_embeddings`.
+        * `original_max_position_embeddings` (`int`): The original max position embeddings used during pretraining.
         * `truncate` (`bool`, *optional*): Whether to truncate the correction range.
 
     Additionally, this function will make use of the following properties if they are found in the config:
@@ -324,15 +323,13 @@ def _compute_yarn_parameters(
     attention_factor = rope_parameters_dict.get("attention_factor")
     mscale = rope_parameters_dict.get("mscale")
     mscale_all_dim = rope_parameters_dict.get("mscale_all_dim")
+    original_max_position_embeddings = rope_parameters_dict["original_max_position_embeddings"]
 
-    # NOTE: DeekSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a
-    # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
-    # values to compute the default attention scaling factor, instead of using `factor`.
-    if "original_max_position_embeddings" in rope_parameters_dict:
-        original_max_position_embeddings = rope_parameters_dict["original_max_position_embeddings"]
+    # NOTE: DeekSeek-V3 (and potentially other models) have `original_max_position_embeddings` field
+    # containing the pretrained value. They use the ratio between `max_position_embeddings` and this value
+    # to compute the default attention scaling factor, instead of using `factor`.
+    if factor is None:
         factor = config.max_position_embeddings / original_max_position_embeddings
-    else:
-        original_max_position_embeddings = config.max_position_embeddings
 
     def get_mscale(scale, mscale=1):
         if scale <= 1:
@@ -393,7 +390,7 @@ def _compute_yarn_parameters(
 
 def _compute_longrope_parameters(
     config: "PreTrainedConfig",
-    device: "torch.device",
+    device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
     layer_type: Optional[str] = None,
 ) -> tuple["torch.Tensor", float]:
@@ -440,7 +437,6 @@ def _compute_longrope_parameters(
         Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
         post-processing scaling factor applied to the computed cos/sin.
     """
-    # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters
     # For backward compatibility standardize the `rope_parameters_dict` if it uses old format
     config.standardize_rope_params()
     rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters
@@ -454,14 +450,13 @@ def _compute_longrope_parameters(
     short_factor = rope_parameters_dict["short_factor"]
     factor = rope_parameters_dict.get("factor")
     attention_factor = rope_parameters_dict.get("attention_factor")
+    original_max_position_embeddings = rope_parameters_dict["original_max_position_embeddings"]
 
     # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
     # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
     # values to compute the default attention scaling factor, instead of using `factor`.
-    if original_max_position_embeddings := getattr(config, "original_max_position_embeddings", None):
+    if factor is None:
         factor = config.max_position_embeddings / original_max_position_embeddings
-    else:
-        original_max_position_embeddings = config.max_position_embeddings
 
     # Sets the attention factor as suggested in the paper
     if attention_factor is None:
@@ -483,7 +478,7 @@ def _compute_longrope_parameters(
 
 def _compute_llama3_parameters(
     config: "PreTrainedConfig",
-    device: "torch.device",
+    device: Optional["torch.device"] = None,
     seq_len: Optional[int] = None,
     layer_type: Optional[str] = None,
 ) -> tuple["torch.Tensor", float]:
@@ -587,7 +582,7 @@ class RopeParameters(TypedDict, total=False):
             most scaling types, a `factor` of x will enable the model to handle sequences of length x *
             original maximum pre-trained length.
         original_max_position_embeddings (`int`, *optional*):
-            Used with 'yarn' and 'longrope'. The original max position embeddings used during
+            Used with 'yarn', 'longrope' and 'llama3'. The original max position embeddings used during
             pretraining.
         attention_factor (`float`, *optional*):
             Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
@@ -641,6 +636,7 @@ class RotaryEmbeddingConfigMixin:
 
         # Standardize and validate the correctness of rotary position embeddings parameters
         self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta))
+
         if "partial_rotary_factor" in kwargs:
             self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"])
         ignore_keys_at_rope_validation = {"partial_rotary_factor"}
@@ -671,14 +667,30 @@ class RotaryEmbeddingConfigMixin:
             rope_parameters.setdefault("rope_theta", rope_theta)
             if partial_rotary_factor is not None:
                 rope_parameters["partial_rotary_factor"] = partial_rotary_factor
+
+            # Move pretraining-time maximum length to rope parameter dict for RoPE types with scaling
+            if rope_parameters["rope_type"] in ["llama3", "yarn", "longrope"]:
+                if hasattr(self, "original_max_position_embeddings"):
+                    # NOTE: Phi3 (and potentially other models) save `original_max_position_embeddings` field
+                    # containing the pretrained value outside rope parameters. This is an exception case where we
+                    # give priority to `self.original_max_position_embeddings`
+                    self.rope_parameters["original_max_position_embeddings"] = self.original_max_position_embeddings
+                else:
+                    self.rope_parameters.setdefault("original_max_position_embeddings", self.max_position_embeddings)
+
         # Case 2: different RoPE for each layer -> several params as nested dict
         else:
-            for layer_type in layer_types:
+            for layer_type in set(layer_types):
                 rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default"))
                 rope_parameters[layer_type].setdefault("rope_theta", rope_theta)
                 if partial_rotary_factor is not None:
                     rope_parameters[layer_type]["partial_rotary_factor"] = partial_rotary_factor
 
+                if rope_parameters[layer_type]["rope_type"] in ["llama3", "yarn", "longrope"]:
+                    self.rope_parameters[layer_type].setdefault(
+                        "original_max_position_embeddings", self.max_position_embeddings
+                    )
+
         self.rope_parameters = rope_parameters
 
     def validate_rope(self: "PreTrainedConfig", ignore_keys: Optional[set] = None):
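
A plain-Python sketch of the standardization precedence added above (not the transformers API): a Phi3-style top-level `original_max_position_embeddings` attribute wins over any value already in `rope_parameters`, while `max_position_embeddings` is only a `setdefault` fallback.

    class Cfg:  # hypothetical stand-in for a model config
        max_position_embeddings = 131072
        original_max_position_embeddings = 4096   # Phi3-style top-level field
        rope_parameters = {"rope_type": "longrope"}

    cfg = Cfg()
    if cfg.rope_parameters["rope_type"] in ["llama3", "yarn", "longrope"]:
        if hasattr(cfg, "original_max_position_embeddings"):
            # top-level attribute takes priority
            cfg.rope_parameters["original_max_position_embeddings"] = cfg.original_max_position_embeddings
        else:
            # otherwise assume max_position_embeddings is the pretraining length
            cfg.rope_parameters.setdefault("original_max_position_embeddings", cfg.max_position_embeddings)
    print(cfg.rope_parameters["original_max_position_embeddings"])  # 4096
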
@@ -725,26 +737,24 @@ class RotaryEmbeddingConfigMixin:
             logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}")
 
     def _validate_dynamic_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None):
-        # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
-        optional_keys = {"original_max_position_embeddings"}
         required_keys = {"rope_type", "factor"}
         received_keys = set(rope_parameters.keys())
         rope_type = rope_parameters["rope_type"]
-        self._check_received_keys(rope_type, received_keys, required_keys,
+        self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)
 
         factor = rope_parameters["factor"]
         if factor is None or not isinstance(factor, float) or factor < 1.0:
             logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}")
 
     def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None):
-        required_keys = {"rope_type", "factor", "rope_theta"}
+        required_keys = {"rope_type", "factor", "rope_theta", "original_max_position_embeddings"}
         optional_keys = {
             "attention_factor",
             "beta_fast",
             "beta_slow",
-            "original_max_position_embeddings",
             "mscale",
             "mscale_all_dim",
+            "truncate",
         }
         received_keys = set(rope_parameters.keys())
         rope_type = rope_parameters["rope_type"]
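
With this hunk, yarn validation promotes `original_max_position_embeddings` from optional to required and newly tolerates a `truncate` key. An illustrative key check mirroring the updated sets (not the transformers implementation):

    required_keys = {"rope_type", "factor", "rope_theta", "original_max_position_embeddings"}
    optional_keys = {"attention_factor", "beta_fast", "beta_slow", "mscale", "mscale_all_dim", "truncate"}

    # A pre-5.0.0rc2-style dict that omits the pretraining length:
    rope_parameters = {"rope_type": "yarn", "factor": 32.0, "rope_theta": 10000.0}
    missing = required_keys - set(rope_parameters)
    print(missing)  # {'original_max_position_embeddings'} -> the key check now raises KeyError
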
@@ -772,37 +782,24 @@ class RotaryEmbeddingConfigMixin:
             f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
         )
 
-        #
-        #
-        #
-        original_max_position_embeddings = self.rope_parameters
-
-
-        implicit_factor = self.max_position_embeddings / original_max_position_embeddings
-        if implicit_factor != factor:
-            logger.warning_once(
-                f"The explicitly set RoPE scaling factor (config.rope_parameters['factor'] = {factor}) does not match "
-                "the ratio implicitly set by other parameters (implicit factor = "
-                "post-yarn context length / pre-yarn context length = "
-                "config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = "
-                f"{implicit_factor}). Using the explicit factor ({factor}) in YaRN. This may cause unexpected "
-                "behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config."
-            )
-        # No `config.rope_parameters["original_max_position_embeddings"]`. Is `config.max_position_embeddings` the
-        # pre-yarn or the post-yarn context length?
-        # BC: we assume it is the pre-yarn context length.
-        else:
+        # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths.
+        # NOTE: we might get `implicit_factor == 1` if config's `original_max_position_embeddings` was
+        # inferred from `max_position_embeddings` during standardization
+        original_max_position_embeddings = self.rope_parameters["original_max_position_embeddings"]
+        implicit_factor = self.max_position_embeddings / original_max_position_embeddings
+        if implicit_factor != factor and implicit_factor != 1:
             logger.warning_once(
-                "config.rope_parameters['
-                "
-                "
-                "
+                f"The explicitly set RoPE scaling factor (config.rope_parameters['factor'] = {factor}) does not match "
+                "the ratio implicitly set by other parameters (implicit factor = "
+                "post-yarn context length / pre-yarn context length = "
+                "config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = "
+                f"{implicit_factor}). Using the explicit factor ({factor}) in YaRN. This may cause unexpected "
+                "behaviour in model usage, please correct the 'original_max_position_embeddings' fields in the model config."
            )
 
     def _validate_longrope_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None):
-        required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta"}
-
-        optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
+        required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta", "original_max_position_embeddings"}
+        optional_keys = {"attention_factor", "factor"}
         received_keys = set(rope_parameters.keys())
         rope_type = rope_parameters["rope_type"]
         self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys)
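
A worked example of the yarn double-check above, with illustrative numbers: a mismatched explicit `factor` triggers the warning, while `implicit_factor == 1` (the pretraining length was inferred during standardization) is deliberately skipped.

    max_position_embeddings = 131072           # post-yarn context length
    original_max_position_embeddings = 4096    # pre-yarn context length
    factor = 8.0                               # explicit, but inconsistent

    implicit_factor = max_position_embeddings / original_max_position_embeddings  # 32.0
    if implicit_factor != factor and implicit_factor != 1:
        print(f"warning: explicit factor {factor} != implicit factor {implicit_factor}")
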
@@ -827,29 +824,28 @@ class RotaryEmbeddingConfigMixin:
                 f"`rope_parameters`'s long_factor field must have length {dim // 2}, got {len(long_factor)}"
             )
 
-
-
-
-
+        factor = rope_parameters.get("factor")
+        original_max_position_embeddings = rope_parameters["original_max_position_embeddings"]
+
+        # Handle Phi3 divergence: we prefer the use of `attention_factor` and/or `factor` over
+        # `original_max_position_embeddings` to compute internal variables. The latter is undesirable
+        if factor is None and original_max_position_embeddings is not None:
             logger.warning_once(
-                "This model has set a `original_max_position_embeddings` field, to be used together with "
+                "This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with "
                 "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`"
                 "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
                 "as it is compatible with most model architectures."
             )
-
-
-
-
-
-
-
-
-
-
-            logger.warning(
-                f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}"
-            )
+        elif factor is None and original_max_position_embeddings is None:
+            logger.warning("Missing required keys in `rope_parameters`: 'factor'")
+        elif not isinstance(factor, float) or factor < 1.0:
+            logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}")
+
+        attention_factor = rope_parameters.get("attention_factor")
+        if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0.0):
+            logger.warning(
+                f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}"
+            )
 
     def _validate_llama3_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None):
         required_keys = {
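
A sketch of the longrope fallback order added above (values invented; list lengths would be `dim // 2` in practice): a dict that relies on `original_max_position_embeddings` without an explicit `factor` takes the Phi3-divergence warning path.

    rope_parameters = {
        "rope_type": "longrope",
        "rope_theta": 10000.0,
        "short_factor": [1.0, 1.0, 1.0, 1.0],
        "long_factor": [2.0, 2.0, 2.0, 2.0],
        "original_max_position_embeddings": 4096,
    }
    factor = rope_parameters.get("factor")
    original_max = rope_parameters["original_max_position_embeddings"]
    if factor is None and original_max is not None:
        # mirrors the warning_once branch: prefer an explicit `factor`
        print("prefer an explicit `factor` over deriving it from the ratio")
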
@@ -906,6 +902,10 @@ class RotaryEmbeddingConfigMixin:
         received_keys -= {"type"}
         required_keys.add("rope_type")
 
+        optional_keys = optional_keys or set()
+        if "partial_rotary_factor" not in optional_keys:
+            optional_keys.add("partial_rotary_factor")
+
         # Some models need to store model-specific keys, and we don't want to throw warning at them
         if ignore_keys is not None:
             received_keys -= ignore_keys
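
After this hunk, `_check_received_keys` always tolerates `partial_rotary_factor`, even when no optional keys were passed. A minimal sketch of the normalization (illustrative, not the transformers implementation):

    optional_keys = None                 # caller passed nothing
    optional_keys = optional_keys or set()
    if "partial_rotary_factor" not in optional_keys:
        optional_keys.add("partial_rotary_factor")
    print(optional_keys)  # {'partial_rotary_factor'}
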
@@ -914,10 +914,7 @@ class RotaryEmbeddingConfigMixin:
         if missing_keys:
             raise KeyError(f"Missing required keys in `rope_parameters` for 'rope_type'='{rope_type}': {missing_keys}")
 
-
-            unused_keys = received_keys - required_keys - optional_keys
-        else:
-            unused_keys = received_keys - required_keys
+        unused_keys = received_keys - required_keys - optional_keys
         if unused_keys:
             logger.warning(f"Unrecognized keys in `rope_parameters` for 'rope_type'='{rope_type}': {unused_keys}")
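
Since `optional_keys` is now always a set, the unused-key check no longer needs a branch. An illustrative run of the simplified logic:

    received_keys = {"rope_type", "factor", "partial_rotary_factor", "typo_key"}
    required_keys = {"rope_type", "factor"}
    optional_keys = {"partial_rotary_factor"}

    unused_keys = received_keys - required_keys - optional_keys
    print(unused_keys)  # {'typo_key'} -> logged as an unrecognized key
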