paddlex 3.0.0rc1__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +11 -59
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +7 -1
- paddlex/inference/models/formula_recognition/processors.py +92 -79
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +893 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +523 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +87 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +565 -1998
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1144 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +563 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +19 -16
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +5 -5
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +8 -1
- paddlex/inference/utils/hpi_model_info_collection.json +81 -2
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/mkldnn_blocklist.py +25 -0
- paddlex/inference/utils/official_models.py +14 -0
- paddlex/inference/utils/pp_option.py +29 -8
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +2 -2
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +1 -1
- paddlex/utils/device.py +15 -8
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +2 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/METADATA +52 -56
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/RECORD +233 -206
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/entry_points.txt +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/licenses/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.1.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/paddlex/inference/models/common/tokenizer/qwen_tokenizer.py
@@ -0,0 +1,288 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import importlib.util
+import os
+import unicodedata
+from typing import Collection, Dict, List, Set, Tuple, Union
+
+from .tokenizer_utils import PretrainedTokenizer
+from .tokenizer_utils_base import AddedToken
+
+__all__ = ["QWenTokenizer"]
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"}
+
+PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
+ENDOFTEXT = "<|endoftext|>"
+IMSTART = "<|im_start|>"
+IMEND = "<|im_end|>"
+# as the default behavior is changed to allow special tokens in
+# regular texts, the surface forms of special tokens need to be
+# as different as possible to minimize the impact
+EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205)))
+SPECIAL_TOKENS = (
+    ENDOFTEXT,
+    IMSTART,
+    IMEND,
+) + EXTRAS
+
+tiktoken = None
+
+
+def is_tiktoken_available():
+    return importlib.util.find_spec("tiktoken") is not None
+
+
+def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
+    with open(tiktoken_bpe_file, "rb") as f:
+        contents = f.read()
+    return {
+        base64.b64decode(token): int(rank)
+        for token, rank in (line.split() for line in contents.splitlines() if line)
+    }
+
+
+class QWenTokenizer(PretrainedTokenizer):
+    """QWen tokenizer."""
+
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+    resource_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        errors="replace",
+        padding_side="left",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if not is_tiktoken_available():
+            raise ValueError(
+                "tiktoken is not installed, please install it use: pip install tiktoken"
+            )
+
+        import tiktoken as tk
+
+        tiktoken = tk
+
+        self.errors = errors  # how to handle errors in decoding
+
+        self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)  # type: dict[bytes, int]
+        self.special_tokens = {
+            token: index
+            for index, token in enumerate(
+                SPECIAL_TOKENS, start=len(self.mergeable_ranks)
+            )
+        }
+
+        enc = tiktoken.Encoding(
+            "Qwen",
+            pat_str=PAT_STR,
+            mergeable_ranks=self.mergeable_ranks,
+            special_tokens=self.special_tokens,
+        )
+        assert (
+            len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab
+        ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding"
+
+        self.decoder = {
+            v: k for k, v in self.mergeable_ranks.items()
+        }  # type: dict[int, bytes|str]
+        self.decoder.update({v: k for k, v in self.special_tokens.items()})
+
+        self.tokenizer = enc  # type: tiktoken.Encoding
+
+        self.eod_id = self.tokenizer.eot_token
+        self.im_start_id = self.special_tokens[IMSTART]
+        self.im_end_id = self.special_tokens[IMEND]
+
+        if "pad_token_id" in kwargs:
+            self.pad_token_id = kwargs["pad_token_id"]
+        if "eos_token_id" in kwargs:
+            self.eos_token_id = kwargs["eos_token_id"]
+
+    def __len__(self) -> int:
+        return self.tokenizer.n_vocab
+
+    def get_vocab(self) -> Dict[bytes, int]:
+        return self.mergeable_ranks
+
+    def convert_tokens_to_ids(
+        self, tokens: Union[bytes, str, List[Union[bytes, str]]]
+    ) -> List[int]:
+        ids = []
+        if isinstance(tokens, (str, bytes)):
+            if tokens in self.special_tokens:
+                return self.special_tokens[tokens]
+            else:
+                return self.mergeable_ranks.get(tokens)
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.mergeable_ranks.get(token))
+        return ids
+
+    def _update_tiktoken(self, tokens: List[str], special_tokens: bool = False) -> int:
+        if special_tokens:
+            added_tokens = []
+            for token in tokens:
+                if token in self.special_tokens:
+                    continue
+
+                token_id = len(self.mergeable_ranks) + len(self.special_tokens)
+                self.special_tokens[token] = token_id
+                self.decoder[token_id] = token
+
+                added_tokens.append(token)
+
+            import tiktoken
+
+            self.tokenizer = tiktoken.Encoding(
+                "Qwen",
+                pat_str=PAT_STR,
+                mergeable_ranks=self.mergeable_ranks,
+                special_tokens=self.special_tokens,
+            )
+
+            return len(added_tokens)
+        else:
+            raise ValueError("Adding regular tokens is not supported")
+
+    def _add_tokens(
+        self,
+        new_tokens: Union[List[str], List[AddedToken]],
+        special_tokens: bool = False,
+    ) -> int:
+        if not special_tokens and new_tokens:
+            raise ValueError("Adding regular tokens is not supported")
+        new_tokens_str = []
+        for token in new_tokens:
+            surface_form = token.content if isinstance(token, AddedToken) else token
+            new_tokens_str.append(surface_form)
+
+        return self._update_tiktoken(new_tokens_str, special_tokens)
+
+    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
+        """
+        Save only the vocabulary of the tokenizer (vocabulary).
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        file_path = os.path.join(save_directory, "qwen.tiktoken")
+        with open(file_path, "w", encoding="utf8") as w:
+            for k, v in self.mergeable_ranks.items():
+                line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n"
+                w.write(line)
+        return (file_path,)
+
+    def tokenize(
+        self,
+        text: str,
+        allowed_special: Union[Set, str] = "all",
+        disallowed_special: Union[Collection, str] = (),
+        **kwargs,
+    ) -> List[Union[bytes, str]]:
+        """
+        Converts a string in a sequence of tokens.
+
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            allowed_special (`Literal["all"]` or `set`):
+                The surface forms of the tokens to be encoded as special tokens in regular texts.
+                Default to "all".
+            disallowed_special (`Literal["all"]` or `Collection`):
+                The surface forms of the tokens that should not be in regular texts and trigger errors.
+                Default to an empty tuple.
+
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific encode method.
+
+        Returns:
+            `List[bytes|str]`: The list of tokens.
+        """
+        tokens = []
+        text = unicodedata.normalize("NFC", text)
+
+        # this implementation takes a detour: text -> token id -> token surface forms
+        for t in self.tokenizer.encode(
+            text, allowed_special=allowed_special, disallowed_special=disallowed_special
+        ):
+            tokens.append(self.decoder[t])
+        return tokens
+
+    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
+        """
+        Converts a sequence of tokens in a single string.
+        """
+        text = ""
+        temp = b""
+        for t in tokens:
+            if isinstance(t, str):
+                if temp:
+                    text += temp.decode("utf-8", errors=self.errors)
+                    temp = b""
+                text += t
+            elif isinstance(t, bytes):
+                temp += t
+            else:
+                raise TypeError("token should only be of type types or str")
+        if temp:
+            text += temp.decode("utf-8", errors=self.errors)
+        return text
+
+    @property
+    def vocab_size(self):
+        return self.tokenizer.n_vocab
+
+    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
+        """Converts an id to a token, special tokens included"""
+        if index in self.decoder:
+            return self.decoder[index]
+        raise ValueError("unknown ids")
+
+    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
+        """Converts a token to an id using the vocab, special tokens included"""
+        if token in self.special_tokens:
+            return self.special_tokens[token]
+        if token in self.mergeable_ranks:
+            return self.mergeable_ranks[token]
+        raise ValueError("unknown token")
+
+    def _tokenize(self, text: str, **kwargs):
+        """
+        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+        Do NOT take care of added tokens.
+        """
+        raise NotImplementedError
+
+    def _decode(
+        self,
+        token_ids: Union[int, List[int]],
+        skip_special_tokens: bool = False,
+        errors: str = None,
+        **kwargs,
+    ) -> str:
+        if isinstance(token_ids, int):
+            token_ids = [token_ids]
+        if skip_special_tokens:
+            token_ids = [i for i in token_ids if i < self.eod_id]
+        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
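The hunk above adds a tiktoken-based QWenTokenizer to the PaddleX tokenizer package. The sketch below is not part of the diff; it is a minimal usage example, assuming `tiktoken` is installed, a local Qwen BPE vocabulary file is available at the hypothetical path `./qwen.tiktoken`, and the module path mirrors the file path added above.

# Minimal usage sketch (not part of the diff).
from paddlex.inference.models.common.tokenizer.qwen_tokenizer import QWenTokenizer

tokenizer = QWenTokenizer(vocab_file="./qwen.tiktoken")  # hypothetical local vocab file

# tokenize() returns token surface forms: bytes for BPE pieces, str for special tokens.
tokens = tokenizer.tokenize("<|im_start|>Hello, PaddleX 3.0.1!<|im_end|>")
ids = tokenizer.convert_tokens_to_ids(tokens)

# Round-trip back to text; convert_tokens_to_string joins bytes/str pieces.
print(tokenizer.convert_tokens_to_string(tokens))
print(len(tokenizer), tokenizer.im_start_id, tokenizer.im_end_id)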
--- a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py
+++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py
@@ -239,7 +239,7 @@ def adapt_stale_fwd_patch(self, name, value):
                     "might be based on an old oversion which missing some "
                     f"arguments compared with the latest, such as {new_args}. "
                     "We automatically add compatibility on the patch for "
-                    "these
+                    "these arguments, and maybe the patch should be updated."
                 )
             else:
                 logging.warning(
@@ -247,7 +247,7 @@ def adapt_stale_fwd_patch(self, name, value):
                     "is patched and the patch might be conflict with patches made "
                     f"by paddlenlp which seems have more arguments such as {new_args}. "
                     "We automatically add compatibility on the patch for "
-                    "these
+                    "these arguments, and maybe the patch should be updated."
                 )
     if isinstance(self, paddle.nn.Layer) and inspect.isfunction(value):

@@ -290,8 +290,8 @@ class InitTrackerMeta(type):

     def __init__(cls, name, bases, attrs):
         init_func = cls.__init__
-        # If attrs has `__init__`, wrap it using
-        # Otherwise, no need to wrap again since the super cls has been
+        # If attrs has `__init__`, wrap it using accessible `_pre_init, _post_init`.
+        # Otherwise, no need to wrap again since the super cls has been wrapped.
         # TODO: remove reduplicated tracker if using super cls `__init__`
         pre_init_func = getattr(cls, "_pre_init", None) if "__init__" in attrs else None
         post_init_func = (
@@ -323,12 +323,12 @@ class InitTrackerMeta(type):

         @functools.wraps(init_func)
         def __impl__(self, *args, **kwargs):
-            #
+            # registered helper by `pre_init_func`
             if pre_init_func:
                 pre_init_func(self, init_func, *args, **kwargs)
             # keep full configuration
             init_func(self, *args, **kwargs)
-            #
+            # registered helper by `post_init_func`
             if post_init_func:
                 post_init_func(self, init_func, *args, **kwargs)
             self.init_config = kwargs
@@ -588,7 +588,7 @@ def _is_control(char):


 def _is_nonnormalized_char(char):
-    """Check
+    """Check whether `chars` is a non-normalized character."""
     cp = ord(char)
     if (
         (0xFF00 <= cp <= 0xFFEF)
@@ -688,7 +688,7 @@ class ChatTemplateMixin:
             conversation = [[conversation]]
         elif isinstance(conversation, list) and isinstance(conversation[0], str):
             raise ValueError(
-                "apply_chat_template do not support
+                "apply_chat_template do not support applying batch conversations, "
                 "so you should apply the conversation one by one."
             )

@@ -710,7 +710,7 @@ class ChatTemplateMixin:
             conversations = conversation
         else:
             raise ValueError(
-                "apply_chat_template do not support
+                "apply_chat_template do not support applying batch conversations, "
                 "so you should apply the conversation one by one."
             )
         query = self.chat_template.render(
@@ -847,7 +847,7 @@ class ChatTemplateMixin:
         self, origin_msg: List[Dict[str, str]], split_s: List[str]
     ):
         """Split the entire chat by specified words. Extract the non-learnable parts."""
-        #
+        # distinguish and replace the special words in original string to an uncompiled form: Like | -> \|
         regex_pattern = "|".join(map(re.escape, split_s))
         # splited by replaced specified words
         non_learnable_parts = re.split(
@@ -1738,7 +1738,7 @@ class PretrainedTokenizer(
                 [0] * len(pair_ids) if pair else []
             )
             encoded_inputs["offset_mapping"] = offset_mapping
-        # Build output
+        # Build output dictionary
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -2108,7 +2108,7 @@ def _is_whitespace(char):
     """
     Checks whether `chars` is a whitespace character.
     """
-    # \t, \n, and \r are technically
+    # \t, \n, and \r are technically control characters but we treat them
     # as whitespace since they are generally considered as such.
     if char == " " or char == "\t" or char == "\n" or char == "\r":
         return True
@@ -2136,7 +2136,7 @@ def convert_to_unicode(text):

 def whitespace_tokenize(text):
     """
-    Runs basic whitespace cleaning and splitting on a
+    Runs basic whitespace cleaning and splitting on a piece of text.
     Args:
         text (str): Text to be tokenized.
     Returns:
--- a/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py
+++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py
@@ -1634,7 +1634,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         # From HF Hub or AI Studio
         if from_hf_hub or from_aistudio:
             # Only include the necessary resource files specified by the tokenizer cls
-            # Deep copy to avoid
+            # Deep copy to avoid modifying the class attributes
             vocab_files = copy.deepcopy(cls.resource_files_names)
             vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file

@@ -3110,7 +3110,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         sequence = ids + pair_ids if pair else ids
         token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

-        # Build output
+        # Build output dictionary
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -3531,7 +3531,7 @@ class PretrainedTokenizerBase(SpecialTokensMixin):
         prefix_offset: int = 0,
         read_offset: int = 0,
     ) -> Tuple[str, int, int]:
-        """tokenizer decoding for the streaming generation use case. This method can be
+        """tokenizer decoding for the streaming generation use case. This method can be overridden for tokenizer that doesn't follow this API"""
         prefix_text = self.decode(
             all_input_ids[prefix_offset:read_offset],
             skip_special_tokens=False,
--- a/paddlex/inference/models/common/tokenizer/vocab.py
+++ b/paddlex/inference/models/common/tokenizer/vocab.py
@@ -27,8 +27,8 @@ class Vocab(object):
     store/load functions.

     Args:
-        counter (collections.Counter, optional): A Counter
-            the tokens and their frequencies. Its keys will be indexed
+        counter (collections.Counter, optional): A Counter instance describes
+            the tokens and their frequencies. Its keys will be indexed according
             to the order of frequency sorting to construct mapping relationship.
             If None, `token_to_idx` must be provided as the mapping relationship.
             Default: None.
@@ -40,7 +40,7 @@ class Vocab(object):
             between tokens and indices to be used. If provided, adjust the tokens
             and indices mapping according to it. If None, counter must be provided.
             Default: None.
-        unk_token (str, optional): Special token for
+        unk_token (str, optional): Special token for unknown token. If no need,
             it also could be None. Default: None.
         pad_token (str, optional): Special token for padding token. If no need,
             it also could be None. Default: None.
@@ -231,7 +231,7 @@ class Vocab(object):
         for idx in indices:
             if not isinstance(idx, (int, np.integer)):
                 warnings.warn(
-                    "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly
+                    "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transferred to `int`. "
                 )
                 idx = int(idx)

@@ -422,7 +422,7 @@ class Vocab(object):
         Args:
             token_to_idx (dict): A dict describes the mapping relationship between
                 tokens and indices.
-            unk_token (str, optional): The special token for
+            unk_token (str, optional): The special token for unknown token. If
                 no need, it also could be None. Default: None.
             pad_token (str, optional): The special token for padding token. If
                 no need, it also could be None. Default: None.
@@ -480,7 +480,7 @@ class Vocab(object):
         **kwargs
     ):
         """
-        Builds the :class:`Vocab`
+        Builds the :class:`Vocab` according to given iterator and other
         information. Firstly, iterate over the `iterator` to construct a
         :class:`collections.Counter` and used to init the as :class:`Vocab`.

@@ -495,7 +495,7 @@ class Vocab(object):
                 relationship between tokens and indices to be used. If provided,
                 adjust the tokens and indices mapping according to it. If None,
                 counter must be provided. Default: None.
-            unk_token (str, optional): The special token for
+            unk_token (str, optional): The special token for unknown token
                 '<unk>'. If no need, it also could be None. Default: None.
             pad_token (str, optional): The special token for padding token
                 '<pad>'. If no need, it also could be None. Default: None.
--- /dev/null
+++ b/paddlex/inference/models/common/vlm/conversion_utils.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+import paddle
+
+
+def fuse_param_func():
+    def fn(fuse_params, is_qkv=False, num_heads=None, num_key_value_heads=None):
+        concat_fn = np.concatenate
+        split_fn = np.split
+        if isinstance(fuse_params[0], paddle.Tensor):
+            concat_fn = paddle.concat
+            split_fn = paddle.split
+
+        if is_qkv:
+            assert (
+                num_heads
+            ), f"num_heads should be number of heads for Q, but got {num_heads}"
+            assert (
+                num_key_value_heads
+            ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}"
+            assert (
+                len(fuse_params) == 3
+            ), f"fuse_params length is not equal 3, it should be Q K V list. but got length {len(fuse_params)}"
+            num_query_groups = num_heads // num_key_value_heads
+            q_list = split_fn(fuse_params[0], num_heads, axis=-1)
+            k_list = split_fn(fuse_params[1], num_key_value_heads, axis=-1)
+            v_list = split_fn(fuse_params[2], num_key_value_heads, axis=-1)
+
+            qkv_pairs = []
+            for i in range(num_key_value_heads):
+                qkv_pairs += q_list[i * num_query_groups : (i + 1) * num_query_groups]
+                qkv_pairs.append(k_list[i])
+                qkv_pairs.append(v_list[i])
+            return concat_fn(qkv_pairs, axis=-1)
+        else:
+            return concat_fn(fuse_params, axis=-1)
+
+    return fn
+
+
+def split_param_func():
+    def fn(
+        fused_param,
+        split_nums=2,
+        is_qkv=False,
+        num_heads=None,
+        num_key_value_heads=None,
+    ):
+        concat_fn = np.concatenate
+        split_fn = np.split
+        if isinstance(fused_param, paddle.Tensor):
+            concat_fn = paddle.concat
+            split_fn = paddle.split
+
+        if is_qkv:
+            assert (
+                num_heads
+            ), f"num_heads should be number of heads for Q, but got {num_heads}"
+            assert (
+                num_key_value_heads
+            ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}"
+            num_query_groups = num_heads // num_key_value_heads
+            q_list, k_list, v_list = [], [], []
+            split_heads = split_fn(
+                fused_param, num_heads + 2 * num_key_value_heads, axis=-1
+            )
+            for i in range(num_key_value_heads):
+                q_list += split_heads[
+                    i * (num_query_groups + 2) : (i + 1) * (num_query_groups + 2) - 2
+                ]
+                k_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 2])
+                v_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 1])
+            return (
+                concat_fn(q_list, axis=-1),
+                concat_fn(k_list, axis=-1),
+                concat_fn(v_list, axis=-1),
+            )
+        else:
+            return split_fn(fused_param, split_nums, axis=-1)
+
+    return fn
+
+
+def split_or_fuse_func(is_fuse=True):
+    return fuse_param_func() if is_fuse else split_param_func()