PyPI - paddlex - Versions diffs - 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl - Mend

paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (240) hide show

paddlex/inference/models/common/vlm/fusion_ops.py ADDED Viewed

@@ -0,0 +1,205 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import paddle
+import paddle.nn.functional as F
+try:
+    from paddle.incubate.nn.functional import fused_rotary_position_embedding
+except ImportError:
+    fused_rotary_position_embedding = None
+try:
+    from paddle.incubate.nn.functional import swiglu
+except ImportError:
+    # The incubate swiglu op is not importable in this Paddle build, so
+    # compose the same computation from regular ops instead.
+    def swiglu(x, y=None):
+        """Return ``silu(x) * y``.
+        When ``y`` is omitted, ``x`` is split into two chunks along its last
+        axis; the first chunk goes through silu and the second is the
+        multiplier.
+        """
+        if y is not None:
+            return F.silu(x) * y
+        first_half, second_half = paddle.chunk(x, chunks=2, axis=-1)
+        return F.silu(first_half) * second_half
+from paddle.utils import try_import
+def get_env_device():
+    """
+    Return the device name of the running environment.
+    Checks run in a fixed priority order and the first match wins: CUDA
+    build ("gpu"), then the registered custom device types "npu", "mlu",
+    "gcu" and "intel_hpu", then ROCm build ("rocm"), then XPU build ("xpu").
+    Falls back to "cpu" when nothing matches.
+    """
+    if paddle.is_compiled_with_cuda():
+        return "gpu"
+    # Custom device types are consulted before the ROCm / XPU build flags.
+    for custom_type in ("npu", "mlu", "gcu", "intel_hpu"):
+        if custom_type in paddle.device.get_all_custom_device_type():
+            return custom_type
+    if paddle.is_compiled_with_rocm():
+        return "rocm"
+    if paddle.is_compiled_with_xpu():
+        return "xpu"
+    return "cpu"
+try:
+    from paddle.incubate.nn.functional import fused_rotary_position_embedding
+except ImportError:
+    fused_rotary_position_embedding = None
+# Best-effort setup: on npu/mlu/gcu register the custom ops shipped as shared
+# libraries, then import flash_attention. Any failure leaves
+# flash_attention set to None instead of breaking the module import.
+try:
+    if get_env_device() in ["npu", "mlu", "gcu"]:
+        # `core` is what fusion_rope / fusion_rms_norm use to run custom ops;
+        # it is only bound on these devices.
+        from paddle.base import core
+        # NOTE(review): only the bare file name is passed on (not joined with
+        # the directory), and an unset CUSTOM_DEVICE_ROOT makes
+        # os.listdir(None) scan the current working directory -- confirm
+        # both are intended.
+        for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")):
+            if lib.endswith(".so"):
+                paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(
+                    lib
+                )
+    from paddle.nn.functional.flash_attention import flash_attention
+except Exception:
+    # Catch Exception rather than using a bare except so KeyboardInterrupt
+    # and SystemExit raised during import are not silently swallowed.
+    flash_attention = None
+def fusion_rope(
+    query_states,
+    key_states,
+    value_states,
+    hidden_states,
+    position_ids,
+    past_key_value,
+    rotary_emb,
+    context_parallel_degree=-1,
+):
+    """
+    Apply rotary position embedding to the query and key states using the
+    fused kernel that matches the device reported by get_env_device().
+    Args:
+        query_states: Tensor whose shape unpacks as
+            (batch_size, seq_length, num_heads, head_dim).
+        key_states: Tensor whose shape unpacks as
+            (_, kv_seq_len, num_key_value_heads, _).
+        value_states: Only passed to `rotary_emb`.
+        hidden_states: Not used by this function.
+        position_ids: Forwarded to the fused kernels (not used on "npu").
+        past_key_value: Must be None unless the device is "gcu" or
+            "intel_hpu"; on "intel_hpu" its first entry extends kv_seq_len.
+        rotary_emb: Called as `rotary_emb(value_states, seq_len=...)` and
+            expected to return (cos, sin); on "gcu" its `get_fused_cos_sin`
+            method is used instead.
+        context_parallel_degree: When > 1 (asserted to be "gpu" only),
+            kv_seq_len is multiplied by this factor.
+    Returns:
+        Tuple (query_states, key_states) as produced by the fused kernel.
+    """
+    # Only the gcu and intel_hpu paths accept a KV cache.
+    if get_env_device() not in ["gcu", "intel_hpu"]:
+        assert past_key_value is None, "fuse rotary not support cache kv for now"
+    batch_size, seq_length, num_heads, head_dim = query_states.shape
+    _, kv_seq_len, num_key_value_heads, _ = key_states.shape
+    if context_parallel_degree > 1:
+        assert (
+            get_env_device() == "gpu"
+        ), "context parallel only support cuda device for now"
+        kv_seq_len *= context_parallel_degree
+    # gcu and intel_hpu build their own cos/sin inside their branches below.
+    if get_env_device() not in ["gcu", "intel_hpu"]:
+        cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)
+    # NOTE: `core` is only imported at module level when the device is
+    # npu/mlu/gcu and that import succeeded.
+    if get_env_device() == "npu":
+        # The "fused_rope" custom op is run separately for query and key.
+        query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[
+            0
+        ]
+        key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0]
+    elif get_env_device() == "intel_hpu":
+        # Extend the sequence length by the cached length before building cos/sin.
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-3]
+        cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)
+        # Drop all size-1 axes, then prepend two leading unit axes.
+        cos = cos.squeeze().unsqueeze(0).unsqueeze(0)
+        sin = sin.squeeze().unsqueeze(0).unsqueeze(0)
+        # Query and key are transposed with perm [0, 2, 1, 3] for the kernel
+        # call and transposed back with the same perm afterwards.
+        query_states, _, _ = (
+            paddle.incubate.nn.functional.fused_rotary_position_embedding(
+                paddle.transpose(query_states, [0, 2, 1, 3]),
+                None,
+                None,
+                sin=sin,
+                cos=cos,
+                position_ids=position_ids,
+            )
+        )
+        key_states, _, _ = (
+            paddle.incubate.nn.functional.fused_rotary_position_embedding(
+                paddle.transpose(key_states, [0, 2, 1, 3]),
+                None,
+                None,
+                sin=sin,
+                cos=cos,
+                position_ids=position_ids,
+            )
+        )
+        query_states = paddle.transpose(query_states, [0, 2, 1, 3])
+        key_states = paddle.transpose(key_states, [0, 2, 1, 3])
+    elif get_env_device() == "gcu":
+        # A single custom op call handles query and key together.
+        cos_sin = rotary_emb.get_fused_cos_sin(value_states, seq_len=kv_seq_len)
+        query_states, key_states = core.eager._run_custom_op(
+            "fused_rotary_embedding_gcu",
+            query_states,
+            key_states,
+            cos_sin,
+            position_ids,
+            True,
+        )
+    else:
+        # NOTE(review): fused_rotary_position_embedding is None when its
+        # import failed at module load; calling it here would then raise.
+        # paddle version > 2.6 or develop support q and k/v with different num_heads
+        # NOTE(review): only the first three characters of the version string
+        # are parsed, so a version such as "2.10" would read as 2.1; 0.0 is
+        # presumably the develop build -- confirm.
+        paddle_version = float(paddle.__version__[:3])
+        if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (
+            num_heads != num_key_value_heads
+        ):
+            # Versions up to 2.6 with differing head counts: run query and key
+            # through the kernel one at a time.
+            query_states, _, _ = fused_rotary_position_embedding(
+                query_states,
+                None,
+                None,
+                sin=sin,
+                cos=cos,
+                position_ids=position_ids,
+                use_neox_rotary_style=False,
+            )
+            key_states, _, _ = fused_rotary_position_embedding(
+                key_states,
+                None,
+                None,
+                sin=sin,
+                cos=cos,
+                position_ids=position_ids,
+                use_neox_rotary_style=False,
+            )
+        else:
+            # Otherwise query and key go through the kernel in one call.
+            query_states, key_states, _ = fused_rotary_position_embedding(
+                query_states,
+                key_states,
+                v=None,
+                sin=sin,
+                cos=cos,
+                position_ids=position_ids,
+                use_neox_rotary_style=False,
+            )
+    return query_states, key_states
+def rms_norm_fused(x_in, w, eps, use_fast_ln=False):
+    """Run RMS norm through the ``fast_ln`` or ``fused_ln`` extension module.
+    ``use_fast_ln`` selects ``fast_ln.fast_rms_norm``; otherwise
+    ``fused_ln.fused_rms_norm`` is used. The module is resolved with
+    ``try_import`` on every call and element 0 of the kernel result is
+    returned.
+    """
+    if not use_fast_ln:
+        return try_import("fused_ln").fused_rms_norm(x_in, w, eps)[0]
+    return try_import("fast_ln").fast_rms_norm(x_in, w, eps)[0]
+def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln=False):
+    """Dispatch RMS norm to the fused implementation for the current device.
+    "npu", "mlu" and "gcu" run their device-specific custom op, "intel_hpu"
+    uses ``paddle.incubate.nn.functional.fused_rms_norm``, and every other
+    device falls through to ``rms_norm_fused`` (the only path that honours
+    ``use_fast_ln``). Element 0 of the kernel result is returned.
+    """
+    device = get_env_device()
+    # Devices whose RMS norm is exposed as a registered custom op.
+    custom_op_name = {
+        "npu": "rms_norm_npu",
+        "mlu": "rms_norm_mlu",
+        "gcu": "rms_norm_gcu",
+    }.get(device)
+    if custom_op_name is not None:
+        outputs = core.eager._run_custom_op(
+            custom_op_name, hidden_states, weight, variance_epsilon
+        )
+        return outputs[0]
+    if device == "intel_hpu":
+        # Positional arguments after `weight`: None, the epsilon, then the
+        # index of the last axis of hidden_states.
+        last_axis = hidden_states.dim() - 1
+        outputs = paddle.incubate.nn.functional.fused_rms_norm(
+            hidden_states, weight, None, variance_epsilon, last_axis
+        )
+        return outputs[0]
+    return rms_norm_fused(hidden_states, weight, variance_epsilon, use_fast_ln)

paddlex/inference/models/common/vlm/generation/configuration_utils.py CHANGED Viewed

@@ -88,7 +88,7 @@ class GenerationConfig:
             use_fast: (bool, optional): Whether to use fast entry of model
                 for FastGeneration. Default to False.
             use_fp16_decoding: (bool, optional): Whether to use fp16 for decoding.
-                Only works when fast entry is avalible. Default to False.
+                Only works when fast entry is available. Default to False.
             trunc_input: (bool, optional): Whether to truncate the inputs from
                 output sequences . Default to True.
             model_kwargs (dict): It can be used to specify additional kwargs

paddlex/inference/models/common/vlm/generation/logits_process.py CHANGED Viewed

@@ -487,7 +487,7 @@ class SequenceBiasLogitsProcessor(LogitsProcessor):
         self._validate_arguments()
         # Bias variables that will be populated on the first call (for retrocompatibility purposes, the vocabulary size
-        # is infered in the first usage, which inhibits initializing here)
+        # is inferred in the first usage, which inhibits initializing here)
         self.length_1_bias = None
         self.prepared_bias_variables = False

paddlex/inference/models/common/vlm/generation/utils.py CHANGED Viewed

@@ -1443,7 +1443,7 @@ class GenerationMixin(object):
             next_tokens = paddle.multinomial(probs)
             if self.config.tensor_parallel_degree > 1:
-                # Maybe no need to broadcast if seed is set correclty.
+                # Maybe no need to broadcast if seed is set correctly.
                 from paddle.distributed import fleet
                 try:

paddlex/inference/models/common/vlm/transformers/configuration_utils.py CHANGED Viewed

@@ -496,7 +496,7 @@ class PretrainedConfig:
             if num_labels is not None and len(self.id2label) != num_labels:
                 logging.warning(
                     f"You passed along `num_labels={num_labels}` with an incompatible id to label map: "
-                    f"{self.id2label}. The number of labels wil be overwritten to {self.num_labels}."
+                    f"{self.id2label}. The number of labels will be overwritten to {self.num_labels}."
                 )
             self.id2label = dict(
                 (int(key), value) for key, value in self.id2label.items()
@@ -909,7 +909,7 @@ class PretrainedConfig:
     def register_unsavable_keys(self, keys):
         # Save: not save it in any case
-        # Print: show it if non defalut value
+        # Print: show it if non default value
         if isinstance(keys, list) or isinstance(keys, tuple):
             for key in keys:
                 self._unsavable_keys.add(key)
@@ -939,7 +939,7 @@ class PretrainedConfig:
             output[key] = value
-        # Fix for rewrited from_pretrained method, hasattr
+        # Fix for rewrote from_pretrained method, hasattr
         if saving_file and hasattr(self, "_unsavable_keys"):
             for key in list(output.keys()):
                 if key in self._unsavable_keys:

paddlex/inference/models/common/vlm/transformers/conversion_utils.py CHANGED Viewed

@@ -51,7 +51,7 @@ class StateDictNameMapping:
         return self.action == "transpose"
     def should_merge_last_two_dim(self) -> bool:
-        """check that wether merge last two dim"""
+        """check that whether merge last two dim"""
         return self.action == "merge_last_two_dim"
     def run(self, state_dict: dict[str, ndarray], name: str) -> ndarray:
@@ -104,7 +104,7 @@ class StateDictNameMapping:
 class ConversionMixin:
     @classmethod
     def support_conversion(cls, config: PretrainedConfig) -> bool:
-        """check wether the model support conversion"""
+        """check whether the model support conversion"""
         try:
             # try to get the name-mapping info
             _ = cls._get_name_mappings(config)
@@ -166,7 +166,7 @@ class ConversionMixin:
             with device_guard("cpu"):
                 state_dict = paddle.load(weight_file, return_numpy=False)
             logging.info(
-                "Starting to convert orignal state_dict to tensor parallel state_dict."
+                "Starting to convert original state_dict to tensor parallel state_dict."
             )
         state_keys_map = cls._resolve_prefix_keys(

paddlex/inference/models/common/vlm/transformers/model_outputs.py CHANGED Viewed

@@ -298,11 +298,11 @@ def _transformer_encoder_fwd(
     # MultiHeadAttention not so efficiently, and maybe optimize it later.
     if cache is None and getattr(self, "_use_cache", False):
         cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers)
-    # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts
+    # To be compatible with `TransformerEncoder.forward`, `_use_cache` defaults
     # to True when cache is not None.
     new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None
     all_attentions = [] if output_attentions else None
-    # NOTE: Also includes embeding output which is same as HF.
+    # NOTE: Also includes embedding output which is same as HF.
     all_hidden_states = [output] if output_hidden_states else None
     for i, mod in enumerate(self.layers):
         # if output has no gradient, recompute is unnecessary

paddlex/inference/models/common/vlm/transformers/model_utils.py CHANGED Viewed

@@ -185,7 +185,7 @@ def _convert_state_dict_dtype_and_shape(state_dict, model_to_load):
         if key in list(state_dict.keys()):
             if isinstance(state_dict[key], np.ndarray):
                 raise ValueError(
-                    "convert_state_dict_dtype expected paddle.Tensor not numpy.ndarray, plase convert numpy.ndarray to paddle.Tensor"
+                    "convert_state_dict_dtype expected paddle.Tensor not numpy.ndarray, please convert numpy.ndarray to paddle.Tensor"
                 )
             # confirm parameter cast is executed on the same device as model
             # TODO: cast(FP32 -> FP16) has diff on different devices, need to fix it
@@ -1080,7 +1080,7 @@ class PretrainedModel(
                 elif "pytorch_model.bin" in str(resolved_archive_file):
                     if not from_hf_hub and not convert_from_torch:
                         raise ValueError(
-                            f"Download pytorch wight in "
+                            f"Download pytorch weight in "
                             f" {resolved_archive_file}. Please set convert_from_torch=True in from_pretrained. eg, Model.from_pretrained(model_name, convert_from_torch=True) "
                         )
@@ -1488,7 +1488,7 @@ class PretrainedModel(
                         dtype,
                     )
-                # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
+                # Mismatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
                 # matching the weights in the model.
                 mismatched_keys += _find_mismatched_keys(
                     state_dict,
@@ -1681,14 +1681,14 @@ class PretrainedModel(
             )
             convert_from_torch = True
-        # from_hf_hub defalut enable convert_from_torch
+        # from_hf_hub default enable convert_from_torch
         if from_hf_hub and convert_from_torch is None:
             logging.warning(
                 "If you are attempting to load weights from Hugging Face Hub and want to disable the default behavior of considering torch weights,"
                 " you can set ·convert_from_torch=False·. By default, `convert_from_torch` is set to `True`. "
             )
             convert_from_torch = True
-        # convert_from_torch defalut is False
+        # convert_from_torch default is False
         if convert_from_torch is None:
             convert_from_torch = False
@@ -1922,7 +1922,7 @@ class PretrainedModel(
                         assert (
                             k
                             not in final_config["mp_config"]["parallelize_plan"].keys()
-                        ), f"sublayer mp_config shuld be a subset of model but got sublayer config {config['mp_config']} and model config {final_config['mp_config']}."
+                        ), f"sublayer mp_config should be a subset of model but got sublayer config {config['mp_config']} and model config {final_config['mp_config']}."
                         final_config["mp_config"]["parallelize_plan"][k] = v
             if "sp_config" in config and config["sp_config"] is not None:
                 if final_config["sp_config"] is None:
@@ -1932,7 +1932,7 @@ class PretrainedModel(
                         assert (
                             k
                             not in final_config["sp_config"]["parallelize_plan"].keys()
-                        ), f"sublayer sp_config shuld be a subset of model but got sublayer config {config['sp_config']} and model config {final_config['sp_config']}."
+                        ), f"sublayer sp_config should be a subset of model but got sublayer config {config['sp_config']} and model config {final_config['sp_config']}."
                         final_config["sp_config"]["parallelize_plan"][k] = v
             if "pp_config" in config and config["pp_config"] is not None:
                 if isinstance(config["pp_config"]["split_spec"], str):
@@ -2011,28 +2011,4 @@ class PretrainedModel(
             merged_config["pp_config"] is not None
             final_config["pp_config"] = merged_config["pp_config"]
-        if (
-            "data_sharding_parallel" in auto_dist_degree
-            and auto_dist_degree["data_sharding_parallel"]
-        ):
-            # to avoid a circular import
-            from paddlenlp.trainer.trainer_utils import ShardingOption
-            level = 0
-            if (
-                "sharding" in auto_dist_degree
-                and auto_dist_degree["sharding"] is not None
-            ):
-                sharding = auto_dist_degree["sharding"]
-                if ShardingOption.SHARD_OP in sharding:
-                    level = 1
-                if ShardingOption.SHARD_GRAD_OP in sharding:
-                    level = 2
-                if ShardingOption.FULL_SHARD in sharding:
-                    level = 3
-            final_config["dp_config"] = {
-                "sharding_level": level,
-                "sharding_mesh_dim": auto_dist_degree.get("sharding_mesh_dim", None),
-            }
         return final_config

paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl

paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl