PyPI - paddlex - Versions diffs - 3.0.0rc0__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl - Mend

paddlex 3.0.0rc0__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (785) hide show

paddlex/inference/models/{3d_bev_detection → m_3d_bev_detection}/visualizer_3d.py RENAMED Viewed

@@ -1,8 +1,23 @@
-import os
-import numpy as np
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import argparse
 import importlib.util
-import sys
+import os
+import numpy as np
 class LazyLoader:
     def __init__(self, module_name):
@@ -14,11 +29,13 @@ class LazyLoader:
             self._module = importlib.import_module(self.module_name)
         return getattr(self._module, item)
-open3d = LazyLoader('open3d')
+open3d = LazyLoader("open3d")
 class Visualizer3D:
     def __init__(self):
-        self.vis = open3d.visualization.Visualizer() # initialize visualizer
+        self.vis = open3d.visualization.Visualizer()  # initialize visualizer
     def boxes_to_lines(self, box: np.ndarray):
         """
@@ -37,7 +54,9 @@ class Visualizer3D:
         box3d = open3d.geometry.OrientedBoundingBox(center, rot, lwh)
         return open3d.geometry.LineSet.create_from_oriented_bounding_box(box3d)
-    def draw_results(self, points: np.ndarray, result: dict, score_threshold: float) -> None:
+    def draw_results(
+        self, points: np.ndarray, result: dict, score_threshold: float
+    ) -> None:
         scores = result["scores"]
         bbox3d = result["bbox3d"]
         label_preds = result["labels"]
@@ -49,28 +68,46 @@ class Visualizer3D:
                 continue
             if bbox3d_dims == 9:
                 print(
-                    "Score: {} Label: {} Box(x_c, y_c, z_c, w, l, h, vec_x, vec_y, -rot): {} {} {} {} {} {} {} {} {}"
-                    .format(scores[box_idx], label_preds[box_idx],
-                            bbox3d[box_idx, 0], bbox3d[box_idx, 1],
-                            bbox3d[box_idx, 2], bbox3d[box_idx, 3],
-                            bbox3d[box_idx, 4], bbox3d[box_idx, 5],
-                            bbox3d[box_idx, 6], bbox3d[box_idx, 7],
-                            bbox3d[box_idx, 8]))
+                    "Score: {} Label: {} Box(x_c, y_c, z_c, w, l, h, vec_x, vec_y, -rot): {} {} {} {} {} {} {} {} {}".format(
+                        scores[box_idx],
+                        label_preds[box_idx],
+                        bbox3d[box_idx, 0],
+                        bbox3d[box_idx, 1],
+                        bbox3d[box_idx, 2],
+                        bbox3d[box_idx, 3],
+                        bbox3d[box_idx, 4],
+                        bbox3d[box_idx, 5],
+                        bbox3d[box_idx, 6],
+                        bbox3d[box_idx, 7],
+                        bbox3d[box_idx, 8],
+                    )
+                )
             elif bbox3d_dims == 7:
                 print(
-                    "Score: {} Label: {} Box(x_c, y_c, z_c, w, l, h, -rot): {} {} {} {} {} {} {}"
-                    .format(scores[box_idx], label_preds[box_idx],
-                            bbox3d[box_idx, 0], bbox3d[box_idx, 1],
-                            bbox3d[box_idx, 2], bbox3d[box_idx, 3],
-                            bbox3d[box_idx, 4], bbox3d[box_idx, 5],
-                            bbox3d[box_idx, 6]))
+                    "Score: {} Label: {} Box(x_c, y_c, z_c, w, l, h, -rot): {} {} {} {} {} {} {}".format(
+                        scores[box_idx],
+                        label_preds[box_idx],
+                        bbox3d[box_idx, 0],
+                        bbox3d[box_idx, 1],
+                        bbox3d[box_idx, 2],
+                        bbox3d[box_idx, 3],
+                        bbox3d[box_idx, 4],
+                        bbox3d[box_idx, 5],
+                        bbox3d[box_idx, 6],
+                    )
+                )
             # draw result
-            result_boxes.append([
-                bbox3d[box_idx, 0], bbox3d[box_idx, 1],
-                bbox3d[box_idx, 2], bbox3d[box_idx, 3],
-                bbox3d[box_idx, 4], bbox3d[box_idx, 5],
-                bbox3d[box_idx, -1]
-            ])
+            result_boxes.append(
+                [
+                    bbox3d[box_idx, 0],
+                    bbox3d[box_idx, 1],
+                    bbox3d[box_idx, 2],
+                    bbox3d[box_idx, 3],
+                    bbox3d[box_idx, 4],
+                    bbox3d[box_idx, 5],
+                    bbox3d[box_idx, -1],
+                ]
+            )
         # config
         self.vis.create_window()
@@ -93,13 +130,17 @@ class Visualizer3D:
             lines = self.boxes_to_lines(result_boxes[i])
             # show different colors for different classes
             if label_preds[i] <= 4:
-                obs_color = [0, 1, 0] # 'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
-            elif (label_preds[i] <= 6):
-                obs_color = [0, 0, 1] # 'bicycle', 'motorcycle'
-            elif (label_preds[i] <= 7):
-                obs_color = [1, 0, 0] # 'pedestrian'
+                obs_color = [
+                    0,
+                    1,
+                    0,
+                ]  # 'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
+            elif label_preds[i] <= 6:
+                obs_color = [0, 0, 1]  # 'bicycle', 'motorcycle'
+            elif label_preds[i] <= 7:
+                obs_color = [1, 0, 0]  # 'pedestrian'
             else:
-                obs_color = [1, 0, 1] # 'traffic_cone','barrier'
+                obs_color = [1, 0, 1]  # 'traffic_cone','barrier'
             lines.paint_uniform_color(obs_color)
             self.vis.add_geometry(lines)
@@ -112,20 +153,17 @@ class Visualizer3D:
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Visualizer 3d')
-    parser.add_argument(
-        '--save_path',
-        type=str,
-        default=None)
+    parser = argparse.ArgumentParser(description="Visualizer 3d")
+    parser.add_argument("--save_path", type=str, default=None)
     args = parser.parse_args()
     save_path = args.save_path
     if save_path is None:
         raise ValueError("Please specify the path to the saved results.")
     points = np.load(os.path.join(save_path, "points.npy"), allow_pickle=True)
     result = np.load(os.path.join(save_path, "results.npy"), allow_pickle=True).item()
     score_threshold = 0.25
     vis = Visualizer3D()
     vis.draw_results(points, result, score_threshold)

paddlex/inference/models/multilingual_speech_recognition/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

paddlex/inference/models/multilingual_speech_recognition/predictor.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import lazy_paddle as paddle
 import numpy as np
-from ....utils.func_register import FuncRegister
-from ...common.batch_sampler import AudioBatchSampler
-from ..base import BasicPredictor
-from .result import WhisperResult
-from ...utils.io import AudioReader
 from ....modules.multilingual_speech_recognition.model_list import MODELS
 from ....utils.download import download_and_extract
+from ...common.batch_sampler import AudioBatchSampler
+from ...utils.io import AudioReader
+from ..base import BasePredictor
+from .result import WhisperResult
-class WhisperPredictor(BasicPredictor):
+class WhisperPredictor(BasePredictor):
     entities = MODELS
@@ -62,12 +59,9 @@ class WhisperPredictor(BasicPredictor):
         Returns:
             AudioReader: An instance of AudioReader.
         """
-        from .processors import (
-            ModelDimensions,
-            Whisper,
-            LANGUAGES,
-            TO_LANGUAGE_CODE,
-        )
+        import paddle
+        from .processors import ModelDimensions, Whisper
         # build model
         model_file = (self.model_dir / f"{self.MODEL_FILE_PREFIX}.pdparams").as_posix()
@@ -91,6 +85,8 @@ class WhisperPredictor(BasicPredictor):
         Returns:
             dict: A dictionary containing the input path and result. The result include 'text', 'segments' and 'language'.
         """
+        import paddle
         from .processors import log_mel_spectrogram
         # load mel_filters from resource_dir and extract feature for audio

paddlex/inference/models/multilingual_speech_recognition/processors.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,19 +13,27 @@
 # limitations under the License.
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
 import os
-import tqdm
 import zlib
-import soundfile
-import numpy as np
-import lazy_paddle as paddle
-from dataclasses import dataclass
-from dataclasses import field
+from dataclasses import dataclass, field
 from functools import lru_cache
 from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
+import numpy as np
+import paddle
+from ....utils.deps import function_requires_deps, is_dep_available
+from ...utils.benchmark import (
+    benchmark,
+    get_inference_operations,
+    set_inference_operations,
+)
 from ..common.tokenizer import GPTTokenizer
+if is_dep_available("soundfile"):
+    import soundfile
+if is_dep_available("tqdm"):
+    import tqdm
 __all__ = [
     "Whisper",
     "Tokenizer",
@@ -336,11 +344,9 @@ class Tokenizer:
         """
         Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
         annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
         - ♪♪♪
         - ( SPEAKING FOREIGN LANGUAGE )
         - [DAVID] Hey there,
         keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
         """
         symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
@@ -706,7 +712,6 @@ class Inference:
     def cleanup_caching(self) -> None:
         """Clean up any resources or hooks after decoding is finished"""
-        pass
 class WhisperInference(Inference):
@@ -752,7 +757,6 @@ def detect_language(
     Detect the spoken language in the audio, and return them as list of strings, along with the ids
     of the most probable language tokens and the probability distribution over all language tokens.
     This is performed outside the main decode loop in order to not interfere with kv-caching.
     Returns
     -------
     language_tokens : Tensor, shape = (batch_size,)
@@ -804,6 +808,7 @@ def detect_language(
     return language_tokens, language_probs
+@function_requires_deps("tqdm")
 def transcribe(
     model: "Whisper",
     mel: paddle.Tensor,
@@ -819,41 +824,31 @@ def transcribe(
 ):
     """
     Transcribe an audio file using Whisper
     Parameters
     ----------
     model: Whisper
         The Whisper model instance
     mel: paddle.Tensor
         The audio feature
     verbose: bool
         Whether to display the text being decoded to the console. If True, displays all the details,
         If False, displays minimal details. If None, does not display anything
     temperature: Union[float, Tuple[float, ...]]
         Temperature for sampling. It can be a tuple of temperatures, which will be successfully used
         upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
     compression_ratio_threshold: float
         If the gzip compression ratio is above this value, treat as failed
     logprob_threshold: float
         If the average log probability over sampled tokens is below this value, treat as failed
     no_speech_threshold: float
         If the no_speech probability is higher than this value AND the average log probability
         over sampled tokens is below `logprob_threshold`, consider the segment as silent
     condition_on_previous_text: bool
         if True, the previous output of the model is provided as a prompt for the next window;
         disabling may make the text inconsistent across windows, but the model becomes less prone to
         getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
     decode_options: dict
         Keyword arguments to construct `DecodingOptions` instances
     Returns
     -------
     A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
@@ -886,7 +881,10 @@ def transcribe(
     language = decode_options["language"]
     task = decode_options.get("task", "transcribe")
     tokenizer = get_tokenizer(
-        model.is_multilingual, resource_path=resource_path, language=language, task=task
+        model.is_multilingual,
+        resource_path=resource_path,
+        language=language,
+        task=task,
     )
     def decode_with_fallback(segment: paddle.Tensor) -> DecodingResult:
@@ -944,7 +942,11 @@ def transcribe(
         initial_prompt = []
     def add_segment(
-        *, start: float, end: float, text_tokens: paddle.Tensor, result: DecodingResult
+        *,
+        start: float,
+        end: float,
+        text_tokens: paddle.Tensor,
+        result: DecodingResult,
     ):
         text = tokenizer.decode(
             [token for token in text_tokens if token < tokenizer.eot]
@@ -1113,29 +1115,26 @@ class TokenDecoder:
         """Initialize any stateful variables for decoding a new sequence"""
     def update(
-        self, tokens: paddle.Tensor, logits: paddle.Tensor, sum_logprobs: paddle.Tensor
+        self,
+        tokens: paddle.Tensor,
+        logits: paddle.Tensor,
+        sum_logprobs: paddle.Tensor,
     ) -> Tuple[paddle.Tensor, bool]:
         """Specify how to select the next token, based on the current trace and logits
         Parameters
         ----------
         tokens : Tensor, shape = (n_batch, current_sequence_length)
             all tokens in the context so far, including the prefix and sot_sequence tokens
         logits : Tensor, shape = (n_batch, vocab_size)
             per-token logits of the probability distribution at the current step
         sum_logprobs : Tensor, shape = (n_batch)
             cumulative log probabilities for each sequence
         Returns
         -------
         tokens : Tensor, shape = (n_batch, current_sequence_length + 1)
             the tokens, appended with the selected next token
         completed : bool
             True if all sequences has reached the end of text
         """
         raise NotImplementedError
@@ -1143,23 +1142,18 @@ class TokenDecoder:
         self, tokens: paddle.Tensor, sum_logprobs: paddle.Tensor
     ) -> Tuple[Sequence[Sequence[paddle.Tensor]], List[List[float]]]:
         """Finalize search and return the final candidate sequences
         Parameters
         ----------
         tokens : Tensor, shape = (batch_size, beam_size, current_sequence_length)
             all tokens in the context so far, including the prefix and sot_sequence
         sum_logprobs : Tensor, shape = (batch_size, beam_size)
             cumulative log probabilities for each sequence
         Returns
         -------
         tokens : Sequence[Sequence[Tensor]], length = batch_size
             sequence of Tensors containing candidate token sequences, for each audio input
         sum_logprobs : List[List[float]], length = batch_size
             sequence of cumulative log probabilities corresponding to the above
         """
         raise NotImplementedError
@@ -1170,7 +1164,10 @@ class GreedyDecoder(TokenDecoder):
         self.eot = eot
     def update(
-        self, tokens: paddle.Tensor, logits: paddle.Tensor, sum_logprobs: paddle.Tensor
+        self,
+        tokens: paddle.Tensor,
+        logits: paddle.Tensor,
+        sum_logprobs: paddle.Tensor,
     ) -> Tuple[paddle.Tensor, bool]:
         temperature = self.temperature
         if temperature == 0:
@@ -1235,7 +1232,10 @@ class BeamSearchDecoder(TokenDecoder):
         self.finished_sequences = None
     def update(
-        self, tokens: paddle.Tensor, logits: paddle.Tensor, sum_logprobs: paddle.Tensor
+        self,
+        tokens: paddle.Tensor,
+        logits: paddle.Tensor,
+        sum_logprobs: paddle.Tensor,
     ) -> Tuple[paddle.Tensor, bool]:
         if tokens.shape[0] % self.beam_size != 0:
             raise ValueError(f"{tokens.shape}[0] % {self.beam_size} != 0")
@@ -1564,7 +1564,10 @@ class DecodingTask:
         return audio_features
     def _detect_language(
-        self, audio_features: paddle.Tensor, tokens: paddle.Tensor, resource_path: str
+        self,
+        audio_features: paddle.Tensor,
+        tokens: paddle.Tensor,
+        resource_path: str,
     ):
         languages = [self.options.language] * audio_features.shape[0]
         lang_probs = None
@@ -1656,20 +1659,16 @@ class DecodingTask:
             ]
         # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling
         audio_features = paddle.repeat_interleave(
             audio_features, self.beam_size, axis=0
         )
         tokens = paddle.repeat_interleave(tokens, self.beam_size, axis=0)
         # call the main sampling loop
         tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)
         # reshape the tensors to have (batch_size, beam_size) as the first two dimensions
         audio_features = audio_features[:: self.beam_size]
         no_speech_probs = no_speech_probs[:: self.beam_size]
         assert audio_features.shape[0] == len(no_speech_probs) == batch_size
         tokens = tokens.reshape([batch_size, self.beam_size, -1])
         sum_logprobs = sum_logprobs.reshape([batch_size, self.beam_size])
@@ -1727,18 +1726,14 @@ def decode(
 ) -> Union[DecodingResult, List[DecodingResult]]:
     """
     Performs decoding of 30-second audio segment(s), provided as Mel spectrogram(s).
     Parameters
     ----------
     model: Whisper
         the Whisper model instance
     mel: paddle.Tensor, shape = (80, 3000) or (*, 80, 3000)
         A tensor containing the Mel spectrogram(s)
     options: DecodingOptions
         A dataclass that contains all necessary options for decoding 30-second segments
     Returns
     -------
     result: Union[DecodingResult, List[DecodingResult]]
@@ -1804,7 +1799,6 @@ class Whisper(paddle.nn.Layer):
         tensors calculated for the previous positions. This method returns a dictionary that stores
         all caches, and the necessary hooks for the key and value projection modules that save the
         intermediate tensors to be reused during later calculations.
         Returns
         -------
         cache : Dict[nn.Layer, paddle.Tensor]
@@ -1836,7 +1830,8 @@ class Whisper(paddle.nn.Layer):
         return cache, hooks
     detect_language = detect_language
-    transcribe = transcribe
+    set_inference_operations(get_inference_operations() + ["speech_transcribe"])
+    transcribe = benchmark.timeit_with_options(name="speech_transcribe")(transcribe)
     decode = decode
@@ -1888,7 +1883,6 @@ def mel_filters(resource_path: str, n_mels: int = N_MELS) -> paddle.Tensor:
     """
     load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
     Allows decoupling librosa dependency; saved using:
         np.savez_compressed(
             "mel_filters.npz",
             mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
@@ -1899,6 +1893,7 @@ def mel_filters(resource_path: str, n_mels: int = N_MELS) -> paddle.Tensor:
         return paddle.to_tensor(f[f"mel_{n_mels}"])
+@function_requires_deps("soundfile")
 def log_mel_spectrogram(
     audio: Union[str, np.ndarray, paddle.Tensor],
     n_mels: int = N_MELS,
@@ -1906,15 +1901,12 @@ def log_mel_spectrogram(
 ):
     """
     Compute the log-Mel spectrogram of
     Parameters
     ----------
     audio: Union[str, np.ndarray, paddle.Tensor], shape = (*)
         The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
     n_mels: int
         The number of Mel-frequency filters, only 80 is supported
     Returns
     -------
     paddle.Tensor, shape = (80, n_frames)

paddlex/inference/models/multilingual_speech_recognition/result.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

paddlex/inference/models/object_detection/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

paddlex/inference/models/object_detection/predictor.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,16 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, List, Sequence, Optional, Union, Tuple
+from typing import Any, List, Optional, Sequence, Tuple, Union
 import numpy as np
-from ....utils.func_register import FuncRegister
 from ....modules.object_detection.model_list import MODELS
+from ....utils.func_register import FuncRegister
 from ...common.batch_sampler import ImageBatchSampler
-from ..common import StaticInfer
-from ..base import BasicPredictor
+from ..base import BasePredictor
 from .processors import (
     DetPad,
     DetPostProcess,
@@ -37,7 +35,7 @@ from .result import DetResult
 from .utils import STATIC_SHAPE_MODEL_LIST
-class DetPredictor(BasicPredictor):
+class DetPredictor(BasePredictor):
     entities = MODELS
@@ -142,11 +140,7 @@ class DetPredictor(BasicPredictor):
             pre_ops.insert(1, self.build_resize(self.img_size, False, 2))
         # build infer
-        infer = StaticInfer(
-            model_dir=self.model_dir,
-            model_prefix=self.MODEL_FILE_PREFIX,
-            option=self.pp_option,
-        )
+        infer = self.create_static_infer()
         # build postprocess op
         post_op = self.build_postprocess()

paddlex 3.0.0rc0__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl

paddlex 3.0.0rc0__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl