paddlex 3.0.0rc1__py3-none-any.whl → 3.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddlex/.version +1 -1
- paddlex/__init__.py +1 -1
- paddlex/configs/modules/chart_parsing/PP-Chart2Table.yaml +13 -0
- paddlex/configs/modules/doc_vlm/PP-DocBee2-3B.yaml +14 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-L.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-M.yaml +40 -0
- paddlex/configs/modules/formula_recognition/PP-FormulaNet_plus-S.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocBlockLayout.yaml +40 -0
- paddlex/configs/modules/layout_detection/PP-DocLayout-L.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-M.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout-S.yaml +2 -2
- paddlex/configs/modules/layout_detection/PP-DocLayout_plus-L.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_mobile_det.yaml +40 -0
- paddlex/configs/modules/text_detection/PP-OCRv5_server_det.yaml +40 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_mobile_rec.yaml +39 -0
- paddlex/configs/modules/text_recognition/PP-OCRv5_server_rec.yaml +39 -0
- paddlex/configs/modules/textline_orientation/PP-LCNet_x1_0_textline_ori.yaml +41 -0
- paddlex/configs/pipelines/OCR.yaml +7 -6
- paddlex/configs/pipelines/PP-ChatOCRv3-doc.yaml +3 -1
- paddlex/configs/pipelines/PP-ChatOCRv4-doc.yaml +91 -34
- paddlex/configs/pipelines/PP-StructureV3.yaml +72 -72
- paddlex/configs/pipelines/doc_understanding.yaml +1 -1
- paddlex/configs/pipelines/formula_recognition.yaml +2 -2
- paddlex/configs/pipelines/layout_parsing.yaml +3 -2
- paddlex/configs/pipelines/seal_recognition.yaml +1 -0
- paddlex/configs/pipelines/table_recognition.yaml +2 -1
- paddlex/configs/pipelines/table_recognition_v2.yaml +7 -1
- paddlex/hpip_links.html +20 -20
- paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py +33 -10
- paddlex/inference/common/batch_sampler/image_batch_sampler.py +34 -25
- paddlex/inference/common/result/mixin.py +19 -12
- paddlex/inference/models/base/predictor/base_predictor.py +2 -8
- paddlex/inference/models/common/static_infer.py +29 -73
- paddlex/inference/models/common/tokenizer/__init__.py +2 -0
- paddlex/inference/models/common/tokenizer/clip_tokenizer.py +1 -1
- paddlex/inference/models/common/tokenizer/gpt_tokenizer.py +2 -2
- paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py +112 -0
- paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py +7 -1
- paddlex/inference/models/common/tokenizer/qwen_tokenizer.py +288 -0
- paddlex/inference/models/common/tokenizer/tokenizer_utils.py +13 -13
- paddlex/inference/models/common/tokenizer/tokenizer_utils_base.py +3 -3
- paddlex/inference/models/common/tokenizer/vocab.py +7 -7
- paddlex/inference/models/common/ts/funcs.py +19 -8
- paddlex/inference/models/common/vlm/conversion_utils.py +99 -0
- paddlex/inference/models/common/vlm/fusion_ops.py +205 -0
- paddlex/inference/models/common/vlm/generation/configuration_utils.py +1 -1
- paddlex/inference/models/common/vlm/generation/logits_process.py +1 -1
- paddlex/inference/models/common/vlm/generation/utils.py +1 -1
- paddlex/inference/models/common/vlm/transformers/configuration_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/conversion_utils.py +3 -3
- paddlex/inference/models/common/vlm/transformers/model_outputs.py +2 -2
- paddlex/inference/models/common/vlm/transformers/model_utils.py +7 -31
- paddlex/inference/models/doc_vlm/modeling/GOT_ocr_2_0.py +830 -0
- paddlex/inference/models/doc_vlm/modeling/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2.py +1606 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_5_vl.py +3006 -0
- paddlex/inference/models/doc_vlm/modeling/qwen2_vl.py +0 -105
- paddlex/inference/models/doc_vlm/predictor.py +79 -24
- paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +97 -0
- paddlex/inference/models/doc_vlm/processors/__init__.py +2 -0
- paddlex/inference/models/doc_vlm/processors/common.py +189 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +548 -0
- paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +21 -176
- paddlex/inference/models/formula_recognition/predictor.py +8 -2
- paddlex/inference/models/formula_recognition/processors.py +90 -77
- paddlex/inference/models/formula_recognition/result.py +28 -27
- paddlex/inference/models/image_feature/processors.py +3 -4
- paddlex/inference/models/keypoint_detection/predictor.py +3 -0
- paddlex/inference/models/object_detection/predictor.py +2 -0
- paddlex/inference/models/object_detection/processors.py +28 -3
- paddlex/inference/models/object_detection/utils.py +2 -0
- paddlex/inference/models/table_structure_recognition/result.py +0 -10
- paddlex/inference/models/text_detection/predictor.py +8 -0
- paddlex/inference/models/text_detection/processors.py +44 -10
- paddlex/inference/models/text_detection/result.py +0 -10
- paddlex/inference/models/text_recognition/result.py +1 -1
- paddlex/inference/pipelines/__init__.py +9 -5
- paddlex/inference/pipelines/_parallel.py +172 -0
- paddlex/inference/pipelines/anomaly_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/attribute_recognition/pipeline.py +11 -1
- paddlex/inference/pipelines/base.py +14 -4
- paddlex/inference/pipelines/components/faisser.py +1 -1
- paddlex/inference/pipelines/doc_preprocessor/pipeline.py +53 -27
- paddlex/inference/pipelines/formula_recognition/pipeline.py +120 -82
- paddlex/inference/pipelines/formula_recognition/result.py +1 -11
- paddlex/inference/pipelines/image_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/image_multilabel_classification/pipeline.py +16 -6
- paddlex/inference/pipelines/instance_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/keypoint_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/layout_parsing/layout_objects.py +859 -0
- paddlex/inference/pipelines/layout_parsing/pipeline.py +34 -47
- paddlex/inference/pipelines/layout_parsing/pipeline_v2.py +832 -260
- paddlex/inference/pipelines/layout_parsing/result.py +4 -17
- paddlex/inference/pipelines/layout_parsing/result_v2.py +259 -245
- paddlex/inference/pipelines/layout_parsing/setting.py +88 -0
- paddlex/inference/pipelines/layout_parsing/utils.py +391 -2028
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/__init__.py +16 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/utils.py +1199 -0
- paddlex/inference/pipelines/layout_parsing/xycut_enhanced/xycuts.py +615 -0
- paddlex/inference/pipelines/m_3d_bev_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/multilingual_speech_recognition/pipeline.py +2 -2
- paddlex/inference/pipelines/object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/ocr/pipeline.py +127 -70
- paddlex/inference/pipelines/ocr/result.py +21 -18
- paddlex/inference/pipelines/open_vocabulary_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/open_vocabulary_segmentation/pipeline.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_base.py +2 -2
- paddlex/inference/pipelines/pp_chatocr/pipeline_v3.py +2 -5
- paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py +6 -6
- paddlex/inference/pipelines/rotated_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/seal_recognition/pipeline.py +109 -53
- paddlex/inference/pipelines/semantic_segmentation/pipeline.py +16 -6
- paddlex/inference/pipelines/small_object_detection/pipeline.py +16 -6
- paddlex/inference/pipelines/table_recognition/pipeline.py +26 -18
- paddlex/inference/pipelines/table_recognition/pipeline_v2.py +624 -53
- paddlex/inference/pipelines/table_recognition/result.py +1 -1
- paddlex/inference/pipelines/table_recognition/table_recognition_post_processing_v2.py +9 -5
- paddlex/inference/pipelines/ts_anomaly_detection/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/ts_forecasting/pipeline.py +2 -2
- paddlex/inference/pipelines/video_classification/pipeline.py +2 -2
- paddlex/inference/pipelines/video_detection/pipeline.py +2 -2
- paddlex/inference/serving/basic_serving/_app.py +46 -13
- paddlex/inference/serving/basic_serving/_pipeline_apps/_common/common.py +5 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/layout_parsing.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv3_doc.py +0 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_chatocrv4_doc.py +1 -1
- paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py +6 -2
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition.py +1 -5
- paddlex/inference/serving/basic_serving/_pipeline_apps/table_recognition_v2.py +4 -5
- paddlex/inference/serving/infra/utils.py +20 -22
- paddlex/inference/serving/schemas/formula_recognition.py +1 -1
- paddlex/inference/serving/schemas/layout_parsing.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv3_doc.py +1 -2
- paddlex/inference/serving/schemas/pp_chatocrv4_doc.py +2 -2
- paddlex/inference/serving/schemas/pp_structurev3.py +10 -6
- paddlex/inference/serving/schemas/seal_recognition.py +1 -1
- paddlex/inference/serving/schemas/table_recognition.py +2 -6
- paddlex/inference/serving/schemas/table_recognition_v2.py +5 -6
- paddlex/inference/utils/hpi.py +30 -16
- paddlex/inference/utils/hpi_model_info_collection.json +666 -162
- paddlex/inference/utils/io/readers.py +12 -12
- paddlex/inference/utils/misc.py +20 -0
- paddlex/inference/utils/mkldnn_blocklist.py +59 -0
- paddlex/inference/utils/official_models.py +140 -5
- paddlex/inference/utils/pp_option.py +74 -9
- paddlex/model.py +2 -2
- paddlex/modules/__init__.py +1 -1
- paddlex/modules/anomaly_detection/evaluator.py +2 -2
- paddlex/modules/base/__init__.py +1 -1
- paddlex/modules/base/evaluator.py +5 -5
- paddlex/modules/base/trainer.py +1 -1
- paddlex/modules/doc_vlm/dataset_checker.py +2 -2
- paddlex/modules/doc_vlm/evaluator.py +2 -2
- paddlex/modules/doc_vlm/exportor.py +2 -2
- paddlex/modules/doc_vlm/model_list.py +1 -1
- paddlex/modules/doc_vlm/trainer.py +2 -2
- paddlex/modules/face_recognition/evaluator.py +2 -2
- paddlex/modules/formula_recognition/evaluator.py +5 -2
- paddlex/modules/formula_recognition/model_list.py +3 -0
- paddlex/modules/formula_recognition/trainer.py +3 -0
- paddlex/modules/general_recognition/evaluator.py +1 -1
- paddlex/modules/image_classification/evaluator.py +2 -2
- paddlex/modules/image_classification/model_list.py +1 -0
- paddlex/modules/instance_segmentation/evaluator.py +1 -1
- paddlex/modules/keypoint_detection/evaluator.py +1 -1
- paddlex/modules/m_3d_bev_detection/evaluator.py +2 -2
- paddlex/modules/multilabel_classification/evaluator.py +2 -2
- paddlex/modules/object_detection/dataset_checker/dataset_src/convert_dataset.py +4 -4
- paddlex/modules/object_detection/evaluator.py +2 -2
- paddlex/modules/object_detection/model_list.py +2 -0
- paddlex/modules/semantic_segmentation/dataset_checker/__init__.py +12 -2
- paddlex/modules/semantic_segmentation/evaluator.py +2 -2
- paddlex/modules/table_recognition/evaluator.py +2 -2
- paddlex/modules/text_detection/evaluator.py +2 -2
- paddlex/modules/text_detection/model_list.py +2 -0
- paddlex/modules/text_recognition/evaluator.py +2 -2
- paddlex/modules/text_recognition/model_list.py +2 -0
- paddlex/modules/ts_anomaly_detection/evaluator.py +2 -2
- paddlex/modules/ts_classification/dataset_checker/dataset_src/split_dataset.py +1 -1
- paddlex/modules/ts_classification/evaluator.py +2 -2
- paddlex/modules/ts_forecast/evaluator.py +2 -2
- paddlex/modules/video_classification/evaluator.py +2 -2
- paddlex/modules/video_detection/evaluator.py +2 -2
- paddlex/ops/__init__.py +8 -5
- paddlex/paddlex_cli.py +19 -13
- paddlex/repo_apis/Paddle3D_api/bev_fusion/model.py +2 -2
- paddlex/repo_apis/PaddleClas_api/cls/config.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/model.py +1 -1
- paddlex/repo_apis/PaddleClas_api/cls/register.py +10 -0
- paddlex/repo_apis/PaddleClas_api/cls/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/instance_seg/runner.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/config.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/model.py +1 -1
- paddlex/repo_apis/PaddleDetection_api/object_det/official_categories.py +25 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/register.py +30 -0
- paddlex/repo_apis/PaddleDetection_api/object_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/formula_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/formula_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/formula_rec/register.py +27 -0
- paddlex/repo_apis/PaddleOCR_api/formula_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/table_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/model.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_det/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_det/runner.py +1 -1
- paddlex/repo_apis/PaddleOCR_api/text_rec/config.py +3 -3
- paddlex/repo_apis/PaddleOCR_api/text_rec/model.py +5 -9
- paddlex/repo_apis/PaddleOCR_api/text_rec/register.py +18 -0
- paddlex/repo_apis/PaddleOCR_api/text_rec/runner.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/model.py +1 -1
- paddlex/repo_apis/PaddleSeg_api/seg/runner.py +1 -1
- paddlex/repo_apis/PaddleTS_api/ts_ad/config.py +3 -3
- paddlex/repo_apis/PaddleTS_api/ts_cls/config.py +2 -2
- paddlex/repo_apis/PaddleTS_api/ts_fc/config.py +4 -4
- paddlex/repo_apis/PaddleVideo_api/video_cls/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_cls/runner.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/config.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/model.py +1 -1
- paddlex/repo_apis/PaddleVideo_api/video_det/runner.py +1 -1
- paddlex/repo_apis/base/config.py +1 -1
- paddlex/repo_manager/core.py +3 -3
- paddlex/repo_manager/meta.py +6 -2
- paddlex/repo_manager/repo.py +17 -16
- paddlex/utils/custom_device_list.py +26 -2
- paddlex/utils/deps.py +3 -3
- paddlex/utils/device.py +5 -13
- paddlex/utils/env.py +4 -0
- paddlex/utils/flags.py +11 -4
- paddlex/utils/fonts/__init__.py +34 -4
- paddlex/utils/misc.py +1 -1
- paddlex/utils/subclass_register.py +2 -2
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/METADATA +349 -208
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/RECORD +240 -211
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/WHEEL +1 -1
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/entry_points.txt +1 -0
- {paddlex-3.0.0rc1.dist-info/licenses → paddlex-3.0.2.dist-info}/LICENSE +0 -0
- {paddlex-3.0.0rc1.dist-info → paddlex-3.0.2.dist-info}/top_level.txt +0 -0
paddlex/inference/common/batch_sampler/doc_vlm_batch_sampler.py

@@ -18,14 +18,26 @@ from .base_batch_sampler import BaseBatchSampler
 
 
 class DocVLMBatchSampler(BaseBatchSampler):
-
+
+    model_names_only_supports_batchsize_of_one = {"PP-DocBee-2B", "PP-DocBee-7B"}
+
+    def __init__(self, model_name, batch_size: int = 1) -> None:
         """Initializes the BaseBatchSampler.
 
         Args:
+            model_name (str): The name of the model.
             batch_size (int, optional): The size of each batch. Only support 1.
         """
-
-
+        self.model_name = model_name
+        if (
+            self.model_name in self.model_names_only_supports_batchsize_of_one
+            and batch_size != 1
+        ):
+            logging.warning(
+                f"doc vlm batch sampler only support batch size 1 for {self.model_name}, but got {batch_size} and it will not take effect."
+            )
+            batch_size = 1
+        super().__init__(batch_size)
 
     def sample(self, inputs):
         """Generate list of input file path.
@@ -37,14 +49,22 @@ class DocVLMBatchSampler(BaseBatchSampler):
             list: list of file path.
         """
         if isinstance(inputs, dict):
-
-
-            yield inputs
-        else:
+            inputs = [inputs]
+        if not (isinstance(inputs, list) and all(isinstance(i, dict) for i in inputs)):
             raise TypeError(
-                f"Not supported input data type! Only `
+                f"Not supported input data type! Only `Dict` or `List[Dict]` are supported, but got: {type(inputs)}."
             )
 
+        batch = []
+        for input_ in inputs:
+            batch.append(input_)
+            if len(batch) == self.batch_size:
+                yield batch
+                batch = []
+
+        if len(batch) > 0:
+            yield batch
+
     @BaseBatchSampler.batch_size.setter
     def batch_size(self, batch_size):
         """Sets the batch size.
@@ -56,9 +76,12 @@ class DocVLMBatchSampler(BaseBatchSampler):
         Warning: If the batch size is not equal 1.
         """
         # only support batch size 1
-        if
+        if (
+            self.model_name in self.model_names_only_supports_batchsize_of_one
+            and batch_size != 1
+        ):
             logging.warning(
-                f"doc vlm batch sampler only support batch size 1, but got {batch_size}."
+                f"doc vlm batch sampler only support batch size 1 for {self.model_name}, but got {batch_size} and it will not take effect."
            )
         else:
             self._batch_size = batch_size
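Taken together, these hunks turn DocVLMBatchSampler from a batch-size-1-only sampler into a general one: a single dict is wrapped into a list, List[Dict] inputs are grouped into batches, and the size is forced back to 1 only for models named in model_names_only_supports_batchsize_of_one. A minimal usage sketch of the new behavior (the import path follows the module layout above; the dict keys are placeholders, not the real input schema):

    from paddlex.inference.common.batch_sampler.doc_vlm_batch_sampler import (
        DocVLMBatchSampler,
    )

    # PP-DocBee2-3B is not in the batch-size-1 set, so batch_size=2 is honored.
    sampler = DocVLMBatchSampler(model_name="PP-DocBee2-3B", batch_size=2)
    inputs = [{"image": "p1.png"}, {"image": "p2.png"}, {"image": "p3.png"}]
    for batch in sampler.sample(inputs):
        print(len(batch))  # 2, then 1 (the remainder batch)

    # For PP-DocBee-2B a warning is logged and the batch size silently stays 1.
    sampler = DocVLMBatchSampler(model_name="PP-DocBee-2B", batch_size=4)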
paddlex/inference/common/batch_sampler/image_batch_sampler.py

@@ -40,7 +40,8 @@ class ImgBatch(Batch):
 
 class ImageBatchSampler(BaseBatchSampler):
 
-
+    IMG_SUFFIX = ["jpg", "png", "jpeg", "bmp"]
+    PDF_SUFFIX = ["pdf"]
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -54,16 +55,19 @@ class ImageBatchSampler(BaseBatchSampler):
         return save_path.as_posix()
 
     def _get_files_list(self, fp):
-        file_list = []
         if fp is None or not os.path.exists(fp):
-            raise Exception(f"Not found any
+            raise Exception(f"Not found any files in path: {fp}")
+        if os.path.isfile(fp):
+            return [fp]
 
-
-
-        elif os.path.isdir(fp):
+        file_list = []
+        if os.path.isdir(fp):
             for root, dirs, files in os.walk(fp):
                 for single_file in files:
-                    if
+                    if (
+                        single_file.split(".")[-1].lower()
+                        in self.IMG_SUFFIX + self.PDF_SUFFIX
+                    ):
                         file_list.append(os.path.join(root, single_file))
         if len(file_list) == 0:
             raise Exception("Not found any file in {}".format(fp))
@@ -81,29 +85,34 @@ class ImageBatchSampler(BaseBatchSampler):
                 if len(batch) == self.batch_size:
                     yield batch
                     batch = ImgBatch()
-            elif isinstance(input, str) and input.split(".")[-1] in ("PDF", "pdf"):
-                file_path = (
-                    self._download_from_url(input)
-                    if input.startswith("http")
-                    else input
-                )
-                for page_idx, page_img in enumerate(self.pdf_reader.read(file_path)):
-                    batch.append(page_img, file_path, page_idx)
-                    if len(batch) == self.batch_size:
-                        yield batch
-                        batch = ImgBatch()
             elif isinstance(input, str):
-
-
-
-
-
-
-
+                suffix = input.split(".")[-1].lower()
+                if suffix in self.PDF_SUFFIX:
+                    file_path = (
+                        self._download_from_url(input)
+                        if input.startswith("http")
+                        else input
+                    )
+                    for page_idx, page_img in enumerate(
+                        self.pdf_reader.read(file_path)
+                    ):
+                        batch.append(page_img, file_path, page_idx)
+                        if len(batch) == self.batch_size:
+                            yield batch
+                            batch = ImgBatch()
+                elif suffix in self.IMG_SUFFIX:
+                    file_path = (
+                        self._download_from_url(input)
+                        if input.startswith("http")
+                        else input
+                    )
                     batch.append(file_path, file_path, None)
                     if len(batch) == self.batch_size:
                         yield batch
                         batch = ImgBatch()
+                else:
+                    file_list = self._get_files_list(input)
+                    yield from self.sample(file_list)
             else:
                 logging.warning(
                     f"Not supported input data type! Only `numpy.ndarray` and `str` are supported! So has been ignored: {input}."
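The rewritten sample() lower-cases the suffix before dispatching, so SCAN.PDF and photo.JPEG are now handled, and any string that is neither a PDF nor an image is expanded via _get_files_list() and re-sampled, which makes directories valid inputs. A standalone sketch of just the routing rule, not the PaddleX API itself:

    IMG_SUFFIX = ["jpg", "png", "jpeg", "bmp"]
    PDF_SUFFIX = ["pdf"]

    def route(path: str) -> str:
        # Mirrors the new suffix dispatch in ImageBatchSampler.sample().
        suffix = path.split(".")[-1].lower()
        if suffix in PDF_SUFFIX:
            return "read pages via pdf_reader"
        if suffix in IMG_SUFFIX:
            return "append the file path to the current batch"
        return "expand with _get_files_list() and recurse"

    print(route("scan.PDF"))     # read pages via pdf_reader
    print(route("photo.JPEG"))   # append the file path to the current batch
    print(route("my_data_dir"))  # expand with _get_files_list() and recurse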
paddlex/inference/common/result/mixin.py

@@ -161,7 +161,7 @@ class JsonMixin:
         else:
             if len(json_data) > 1:
                 logging.warning(
-                    f"The result has multiple json files need to be saved. But the `save_path` has been
+                    f"The result has multiple json files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._json_writer.write(
                 save_path,
@@ -264,7 +264,7 @@ class Base64Mixin:
         else:
             if len(base64) > 1:
                 logging.warning(
-                    f"The result has multiple base64 files need to be saved. But the `save_path` has been
+                    f"The result has multiple base64 files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._base64_writer.write(
                 save_path, base64[list(base64.keys())[0]], *args, **kwargs
@@ -328,7 +328,7 @@ class ImgMixin:
         else:
             if len(img) > 1:
                 logging.warning(
-                    f"The result has multiple img files need to be saved. But the `save_path` has been
+                    f"The result has multiple img files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._img_writer.write(save_path, img[list(img.keys())[0]], *args, **kwargs)
 
@@ -392,7 +392,7 @@ class CSVMixin:
         else:
             if len(csv) > 1:
                 logging.warning(
-                    f"The result has multiple csv files need to be saved. But the `save_path` has been
+                    f"The result has multiple csv files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._csv_writer.write(save_path, csv[list(csv.keys())[0]], *args, **kwargs)
 
@@ -455,7 +455,7 @@ class HtmlMixin:
         else:
             if len(html) > 1:
                 logging.warning(
-                    f"The result has multiple html files need to be saved. But the `save_path` has been
+                    f"The result has multiple html files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._html_writer.write(
                 save_path, html[list(html.keys())[0]], *args, **kwargs
@@ -524,7 +524,7 @@ class XlsxMixin:
         else:
             if len(xlsx) > 1:
                 logging.warning(
-                    f"The result has multiple xlsx files need to be saved. But the `save_path` has been
+                    f"The result has multiple xlsx files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             self._xlsx_writer.write(
                 save_path, xlsx[list(xlsx.keys())[0]], *args, **kwargs
@@ -589,7 +589,7 @@ class VideoMixin:
         else:
             if len(video) > 1:
                 logging.warning(
-                    f"The result has multiple video files need to be saved. But the `save_path` has been
+                    f"The result has multiple video files need to be saved. But the `save_path` has been specified as `{save_path}`!"
                 )
             video_writer.write(save_path, video[list(video.keys())[0]], *args, **kwargs)
 
@@ -609,10 +609,13 @@ class MarkdownMixin:
         self._save_funcs.append(self.save_to_markdown)
 
     @abstractmethod
-    def _to_markdown(self) -> Dict[str, Union[str, Dict[str, Any]]]:
+    def _to_markdown(self, pretty=True) -> Dict[str, Union[str, Dict[str, Any]]]:
         """
         Convert the result to markdown format.
 
+        Args:
+            pretty (Optional[bool]): whether to pretty markdown by HTML, default by True.
+
         Returns:
             Dict[str, Union[str, Dict[str, Any]]]: A dictionary containing markdown text and image data.
         """
@@ -627,7 +630,7 @@ class MarkdownMixin:
         """
         return self._to_markdown()
 
-    def save_to_markdown(self, save_path, *args, **kwargs) -> None:
+    def save_to_markdown(self, save_path, pretty=True, *args, **kwargs) -> None:
         """Save the markdown data to a file.
 
         Args:
@@ -665,7 +668,7 @@ class MarkdownMixin:
             self._markdown_writer.write,
             self._img_writer.write,
             self.save_path,
-            self._to_markdown(),
+            self._to_markdown(pretty=pretty),
             *args,
             **kwargs,
         )
@@ -698,5 +701,9 @@ class MarkdownMixin:
         if isinstance(value, dict):
             base_save_path = save_path.parent
             for img_path, img_data in value.items():
-
-
+                save_img_func(
+                    (base_save_path / img_path).as_posix(),
+                    img_data,
+                    *args,
+                    **kwargs,
+                )
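For callers, the visible change in MarkdownMixin is the new pretty flag on save_to_markdown, which is forwarded to _to_markdown. A hedged usage sketch, where result stands for any pipeline result that mixes in MarkdownMixin:

    result.save_to_markdown("output/report.md")                 # pretty=True by default: markdown prettified with HTML
    result.save_to_markdown("output/report.md", pretty=False)   # plain markdown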
paddlex/inference/models/base/predictor/base_predictor.py

@@ -118,17 +118,9 @@ class BasePredictor(
         self.batch_sampler.batch_size = batch_size
         self._use_hpip = use_hpip
         if not use_hpip:
-            if hpi_config is not None:
-                logging.warning(
-                    "`hpi_config` will be ignored when not using the high-performance inference plugin."
-                )
             self._pp_option = self._prepare_pp_option(pp_option, device)
         else:
             require_hpip()
-            if pp_option is not None:
-                logging.warning(
-                    "`pp_option` will be ignored when using the high-performance inference plugin."
-                )
             self._hpi_config = self._prepare_hpi_config(hpi_config, device)
 
         logging.debug(f"{self.__class__.__name__}: {self.model_dir}")
@@ -343,6 +335,8 @@ class BasePredictor(
         device_info = None
         if pp_option is None:
             pp_option = PaddlePredictorOption(model_name=self.model_name)
+        elif pp_option.model_name is None:
+            pp_option.model_name = self.model_name
        if device_info:
             pp_option.device_type = device_info[0]
             pp_option.device_id = device_info[1]
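The second hunk means a user-built PaddlePredictorOption no longer needs to carry the model name itself; if model_name is left unset, the predictor back-fills it. A sketch under the assumption that the option is handed to a predictor via the usual pp_option argument:

    from paddlex.inference.utils.pp_option import PaddlePredictorOption

    opt = PaddlePredictorOption()  # model_name left as None
    # When a predictor receives this via `pp_option=opt`, the new
    # `elif pp_option.model_name is None` branch fills in the predictor's own
    # model name before the device info is applied.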
paddlex/inference/models/common/static_infer.py

@@ -22,8 +22,7 @@ import numpy as np
 
 from ....utils import logging
 from ....utils.deps import class_requires_deps
-from ....utils.
-from ....utils.flags import DEBUG, INFER_BENCHMARK_USE_NEW_INFER_API, USE_PIR_TRT
+from ....utils.flags import DEBUG, USE_PIR_TRT
 from ...utils.benchmark import benchmark, set_inference_operations
 from ...utils.hpi import (
     HPIConfig,
@@ -34,15 +33,12 @@ from ...utils.hpi import (
     suggest_inference_backend_and_config,
 )
 from ...utils.model_paths import get_model_paths
-from ...utils.pp_option import PaddlePredictorOption
+from ...utils.pp_option import PaddlePredictorOption, get_default_run_mode
 from ...utils.trt_config import DISABLE_TRT_HALF_OPS_CONFIG
 
 CACHE_DIR = ".cache"
 
 INFERENCE_OPERATIONS = [
-    "PaddleCopyToDevice",
-    "PaddleCopyToHost",
-    "PaddleModelInfer",
     "PaddleInferChainLegacy",
     "MultiBackendInfer",
 ]
@@ -233,47 +229,6 @@ def _sort_inputs(inputs, names):
     return inputs
 
 
-def _concatenate(*callables):
-    def _chain(x):
-        for c in callables:
-            x = c(x)
-        return x
-
-    return _chain
-
-
-@benchmark.timeit
-class PaddleCopyToDevice:
-    def __init__(self, device_type, device_id):
-        self.device_type = device_type
-        self.device_id = device_id
-
-    def __call__(self, arrs):
-        import paddle
-
-        device_id = [self.device_id] if self.device_id is not None else self.device_id
-        device = constr_device(self.device_type, device_id)
-        paddle_tensors = [paddle.to_tensor(i, place=device) for i in arrs]
-        return paddle_tensors
-
-
-@benchmark.timeit
-class PaddleCopyToHost:
-    def __call__(self, paddle_tensors):
-        arrs = [i.numpy() for i in paddle_tensors]
-        return arrs
-
-
-@benchmark.timeit
-class PaddleModelInfer:
-    def __init__(self, predictor):
-        super().__init__()
-        self.predictor = predictor
-
-    def __call__(self, x):
-        return self.predictor.run(x)
-
-
 # FIXME: Name might be misleading
 @benchmark.timeit
 class PaddleInferChainLegacy:
@@ -317,15 +272,7 @@ class PaddleInfer(StaticInfer):
         self.model_file_prefix = model_file_prefix
         self._option = option
         self.predictor = self._create()
-
-            device_type = self._option.device_type
-            device_type = "gpu" if device_type == "dcu" else device_type
-            copy_to_device = PaddleCopyToDevice(device_type, self._option.device_id)
-            copy_to_host = PaddleCopyToHost()
-            model_infer = PaddleModelInfer(self.predictor)
-            self.infer = _concatenate(copy_to_device, model_infer, copy_to_host)
-        else:
-            self.infer = PaddleInferChainLegacy(self.predictor)
+        self.infer = PaddleInferChainLegacy(self.predictor)
 
     def __call__(self, x: Sequence[np.ndarray]) -> List[np.ndarray]:
         names = self.predictor.get_input_names()
@@ -373,7 +320,7 @@ class PaddleInfer(StaticInfer):
             logging.debug("`device_id` has been set to None")
 
         if (
-            self._option.device_type in ("gpu", "dcu")
+            self._option.device_type in ("gpu", "dcu", "npu", "mlu", "gcu", "xpu")
             and self._option.device_id is None
         ):
             self._option.device_id = 0
@@ -402,6 +349,7 @@ class PaddleInfer(StaticInfer):
                 if self._option.run_mode == "paddle_fp16"
                 else PrecisionType.Float32
             )
+            config.disable_mkldnn()
             config.enable_use_gpu(100, self._option.device_id, precision)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir(self._option.enable_new_ir)
@@ -417,12 +365,16 @@ class PaddleInfer(StaticInfer):
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
         elif self._option.device_type == "xpu":
+            config.enable_xpu()
+            config.set_xpu_device_id(self._option.device_id)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir(self._option.enable_new_ir)
             if hasattr(config, "enable_new_executor"):
                 config.enable_new_executor()
+            config.delete_pass("conv2d_bn_xpu_fuse_pass")
+            config.delete_pass("transfer_layout_pass")
         elif self._option.device_type == "mlu":
-            config.enable_custom_device("mlu")
+            config.enable_custom_device("mlu", self._option.device_id)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir(self._option.enable_new_ir)
             if hasattr(config, "enable_new_executor"):
@@ -431,7 +383,7 @@ class PaddleInfer(StaticInfer):
             from paddle_custom_device.gcu import passes as gcu_passes
 
             gcu_passes.setUp()
-            config.enable_custom_device("gcu")
+            config.enable_custom_device("gcu", self._option.device_id)
             if hasattr(config, "enable_new_ir"):
                 config.enable_new_ir()
             if hasattr(config, "enable_new_executor"):
@@ -455,15 +407,10 @@ class PaddleInfer(StaticInfer):
             assert self._option.device_type == "cpu"
             config.disable_gpu()
             if "mkldnn" in self._option.run_mode:
-
-
-
-
-                except Exception:
-                    logging.warning(
-                        "MKL-DNN is not available. We will disable MKL-DNN."
-                    )
-                config.set_mkldnn_cache_capacity(-1)
+                config.enable_mkldnn()
+                if "bf16" in self._option.run_mode:
+                    config.enable_mkldnn_bfloat16()
+                config.set_mkldnn_cache_capacity(self._option.mkldnn_cache_capacity)
             else:
                 if hasattr(config, "disable_mkldnn"):
                     config.disable_mkldnn()
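On CPU, the mkldnn run modes now enable MKL-DNN unconditionally (with optional bfloat16) instead of probing for availability, and the cache capacity comes from the new pp_option.mkldnn_cache_capacity instead of the hard-coded -1. The calls are standard paddle.inference.Config APIs; shown here in isolation with placeholder model paths:

    from paddle.inference import Config

    config = Config("inference.pdmodel", "inference.pdiparams")  # placeholder paths
    config.disable_gpu()
    run_mode = "mkldnn_bf16"  # e.g. taken from PaddlePredictorOption.run_mode
    if "mkldnn" in run_mode:
        config.enable_mkldnn()
        if "bf16" in run_mode:
            config.enable_mkldnn_bfloat16()
        config.set_mkldnn_cache_capacity(10)  # PaddleX reads this from pp_option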
paddlex/inference/models/common/static_infer.py (continued)

@@ -687,10 +634,19 @@ class HPInfer(StaticInfer):
         )
         backend_config = self._config.backend_config or {}
 
-        if backend == "paddle"
-
-
-        )
+        if backend == "paddle":
+            if not backend_config:
+                is_default_config = True
+            elif backend_config.keys() != {"run_mode"}:
+                is_default_config = False
+            else:
+                is_default_config = backend_config["run_mode"] == get_default_run_mode(
+                    self._config.pdx_model_name, self._config.device_type
+                )
+            if is_default_config:
+                logging.warning(
+                    "The Paddle Inference backend is selected with the default configuration. This may not provide optimal performance."
+                )
 
         return backend, backend_config
 
@@ -833,7 +789,7 @@ class HPInfer(StaticInfer):
             for name, shapes in backend_config.dynamic_shapes.items():
                 ui_option.trt_option.set_shape(name, *shapes)
         else:
-            logging.
+            logging.info(
                 "TensorRT dynamic shapes will be loaded from the file."
             )
         elif backend == "om":
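The new HPInfer logic warns only when the Paddle backend is selected with a "default" configuration. A standalone restatement of the decision:

    def is_default_paddle_config(backend_config: dict, default_run_mode: str) -> bool:
        # Mirrors the new HPInfer branch: an empty config, or one whose only key
        # is `run_mode` set to the model/device default, counts as default.
        if not backend_config:
            return True
        if backend_config.keys() != {"run_mode"}:
            return False
        return backend_config["run_mode"] == default_run_mode

    print(is_default_paddle_config({}, "paddle"))                      # True -> warning fires
    print(is_default_paddle_config({"run_mode": "mkldnn"}, "paddle"))  # False -> no warning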
paddlex/inference/models/common/tokenizer/__init__.py

@@ -15,5 +15,7 @@
 from .bert_tokenizer import BertTokenizer
 from .clip_tokenizer import CLIPTokenizer
 from .gpt_tokenizer import GPTTokenizer
+from .qwen2_5_tokenizer import MIXQwen2_5_Tokenizer
 from .qwen2_tokenizer import MIXQwen2Tokenizer, Qwen2Tokenizer
+from .qwen_tokenizer import QWenTokenizer
 from .tokenizer_utils import PretrainedTokenizer
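Both new tokenizers are therefore importable alongside the existing ones (top-level path assumed from the wheel layout above):

    from paddlex.inference.models.common.tokenizer import (
        MIXQwen2_5_Tokenizer,
        QWenTokenizer,
    )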
paddlex/inference/models/common/tokenizer/clip_tokenizer.py

@@ -403,7 +403,7 @@ class CLIPTokenizer(PretrainedTokenizer):
         Returns the size of vocabulary.
 
         Returns:
-            int: The sum of size of vocabulary and the size of
+            int: The sum of size of vocabulary and the size of special tokens.
 
         """
         return len(self.encoder)
paddlex/inference/models/common/tokenizer/gpt_tokenizer.py

@@ -41,7 +41,7 @@ def bytes_to_unicode():
     The reversible bpe codes work on unicode strings.
     This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
     When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a
+    This is a significant percentage of your normal, say, 32K bpe vocab.
     To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
     And avoids mapping to whitespace/control characters the bpe code barfs on.
     """
@@ -241,7 +241,7 @@ class GPTTokenizer(PretrainedTokenizer):
     Returns the size of vocabulary.
 
     Returns:
-        int: The sum of size of vocabulary and the size of
+        int: The sum of size of vocabulary and the size of special tokens.
 
     """
 
paddlex/inference/models/common/tokenizer/qwen2_5_tokenizer.py (new file)

@@ -0,0 +1,112 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import List
+
+from .qwen2_tokenizer import Qwen2Tokenizer
+from .tokenizer_utils_base import AddedToken, TextInput
+
+
+class MIXQwen2_5_Tokenizer(Qwen2Tokenizer):
+    def __init__(self, *args, **kwargs):
+        super(MIXQwen2_5_Tokenizer, self).__init__(*args, **kwargs)
+
+    def tokenize(self, text: TextInput, **kwargs) -> List[str]:
+        """
+        Converts a string in a sequence of tokens, using the tokenizer.
+
+        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
+        (BPE/SentencePieces/WordPieces). Takes care of added tokens.
+
+        Args:
+            text (`str`):
+                The sequence to be encoded.
+            **kwargs (additional keyword arguments):
+                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
+
+        Returns:
+            `List[str]`: The list of tokens.
+        """
+
+        split_special_tokens = kwargs.pop(
+            "split_special_tokens", self.split_special_tokens
+        )
+
+        all_special_tokens_extended = dict(
+            (str(t), t)
+            for t in self.all_special_tokens_extended
+            if isinstance(t, AddedToken)
+        )
+
+        # Add special tokens
+        for t in self.added_tokens_decoder:
+            token = self.added_tokens_decoder[t]
+            if isinstance(token, AddedToken) and token.special:
+                all_special_tokens_extended[str(token)] = token
+                if str(token) not in self.all_special_tokens:
+                    self.all_special_tokens.append(str(token))
+                if str(token) not in self.unique_no_split_tokens:
+                    self.unique_no_split_tokens.append(str(token))
+
+        self._create_trie(self.unique_no_split_tokens)
+
+        text, kwargs = self.prepare_for_tokenization(text, **kwargs)
+
+        # TODO: should this be in the base class?
+        if hasattr(self, "do_lower_case") and self.do_lower_case:
+            # convert non-special tokens to lowercase
+            escaped_special_toks = [
+                re.escape(s_tok)
+                for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
+            ]
+            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
+            text = re.sub(
+                pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text
+            )
+
+        if split_special_tokens:
+            no_split_token = []
+            tokens = [text]
+        else:
+            no_split_token = set(
+                self.unique_no_split_tokens
+            )  # don't split on any of the added tokens
+            tokens = self.tokens_trie.split(text)
+
+        for i, token in enumerate(tokens):
+            if token in no_split_token:
+                tok_extended = all_special_tokens_extended.get(token, None)
+                left = tokens[i - 1] if i > 0 else None
+                right = tokens[i + 1] if i < len(tokens) - 1 else None
+                if isinstance(tok_extended, AddedToken):
+                    if tok_extended.rstrip and right:
+                        # A bit counter-intuitive but we strip the left of the string
+                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
+                        tokens[i + 1] = right.lstrip()
+                    # Strip white spaces on the left
+                    if tok_extended.lstrip and left:
+                        tokens[i - 1] = left.rstrip()  # Opposite here
+
+        tokenized_text = []
+        for token in tokens:
+            # Need to skip eventual empty (fully stripped) tokens
+            if not token:
+                continue
+            if token in no_split_token:
+                tokenized_text.append(token)
+            else:
+                tokenized_text.extend(self._tokenize(token))
+
+        return tokenized_text
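The only behavioral twist in this otherwise standard tokenize() is the do_lower_case branch: everything is lower-cased except the special tokens themselves. The trick can be reproduced standalone (token list is illustrative):

    import re

    special = ["<|endoftext|>", "<|im_start|>"]
    pattern = r"(" + r"|".join(re.escape(t) for t in special) + r")|" + r"(.+?)"
    text = "Hello <|im_start|>World"
    print(re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text))
    # -> hello <|im_start|>world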
paddlex/inference/models/common/tokenizer/qwen2_tokenizer.py

@@ -18,6 +18,7 @@ import unicodedata
 from functools import lru_cache
 from typing import List, Optional, Tuple
 
+from .....utils import logging
 from .....utils.deps import is_dep_available
 from .tokenizer_utils import PretrainedTokenizer
 from .tokenizer_utils_base import AddedToken, TextInput
@@ -146,7 +147,12 @@ class Qwen2Tokenizer(PretrainedTokenizer):
         split_special_tokens=False,
         **kwargs,
     ):
-
+        if unk_token is None:
+            logging.info(
+                "The `unk_token` parameter needs to be defined: we use `eos_token` by default."
+            )
+            unk_token = eos_token
+
         # Qwen vocab does not contain control tokens; added tokens need to be special
         bos_token = (
             AddedToken(
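Net effect of the Qwen2Tokenizer change: a missing unk_token no longer has to be supplied by the caller; it falls back to eos_token with an informational log message, mirroring this small sketch:

    def resolve_unk_token(unk_token, eos_token="<|endoftext|>"):
        # eos_token value is illustrative; the real default comes from the tokenizer config.
        if unk_token is None:
            unk_token = eos_token
        return unk_token

    print(resolve_unk_token(None))  # <|endoftext|>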
|