PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/stt/models/whisper/writers.py ADDED Viewed

@@ -0,0 +1,268 @@
+# Copyright © 2024 Apple Inc.
+import json
+import pathlib
+import re
+from typing import Callable, List, Optional, TextIO
+def format_timestamp(
+    seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
+):
+    """Render a duration given in seconds as ``[HH:]MM:SS<marker>mmm``.
+    The value is rounded to whole milliseconds first. The hours field is
+    emitted only when ``always_include_hours`` is set or the value reaches
+    one hour. ``decimal_marker`` separates seconds from milliseconds.
+    Negative input trips the assertion below.
+    """
+    assert seconds >= 0, "non-negative timestamp expected"
+    total_ms = round(seconds * 1000.0)
+    # Peel off hours, then minutes, then seconds; what is left is milliseconds.
+    hh, leftover_ms = divmod(total_ms, 3_600_000)
+    mm, leftover_ms = divmod(leftover_ms, 60_000)
+    ss, ms = divmod(leftover_ms, 1_000)
+    hours_prefix = ""
+    if always_include_hours or hh > 0:
+        hours_prefix = f"{hh:02d}:"
+    return f"{hours_prefix}{mm:02d}:{ss:02d}{decimal_marker}{ms:03d}"
+def get_start(segments: List[dict]) -> Optional[float]:
+    """Return the earliest start time found in ``segments``.
+    The start of the first word of the first segment that has any words is
+    preferred. If no segment carries word timings, the ``"start"`` of the
+    first segment is returned instead. ``None`` is returned for an empty list.
+    """
+    for segment in segments:
+        # "words" exists only when word-level timestamps were produced; a
+        # missing (or None) entry is treated like an empty list so the
+        # segment-level fallback below is reached instead of a KeyError.
+        for word in segment.get("words") or ():
+            return word["start"]
+    return segments[0]["start"] if segments else None
+class ResultWriter:
+    """Base class for transcript writers that produce one file per result.
+    Subclasses set ``extension`` and implement ``write_result``.
+    """
+    # File suffix, without the leading dot, given to the output file.
+    extension: str
+    def __init__(self, output_dir: str):
+        """Remember the directory that output files are created in."""
+        self.output_dir = output_dir
+    def __call__(
+        self, result: dict, output_name: str, options: Optional[dict] = None, **kwargs
+    ):
+        """Write ``result`` to ``output_dir/output_name`` using this writer's suffix.
+        Any suffix already present on ``output_name`` is replaced by
+        ``.<extension>``. The file is opened as UTF-8 text and handed to
+        ``write_result`` together with ``options`` and ``kwargs``.
+        """
+        destination = pathlib.Path(self.output_dir).joinpath(output_name)
+        destination = destination.with_suffix(f".{self.extension}")
+        with destination.open("wt", encoding="utf-8") as handle:
+            self.write_result(result, file=handle, options=options, **kwargs)
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        """Serialise ``result`` into ``file``; concrete writers override this."""
+        raise NotImplementedError
+class WriteTXT(ResultWriter):
+    """Plain-text writer: one stripped segment text per line."""
+    extension: str = "txt"
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        """Print each segment's text, whitespace-stripped, flushing after every line."""
+        stripped_lines = (entry["text"].strip() for entry in result["segments"])
+        for line in stripped_lines:
+            print(line, file=file, flush=True)
+class SubtitlesWriter(ResultWriter):
+    """Shared logic for subtitle formats: turns a result into timed cues.
+    Subclasses provide ``always_include_hours`` and ``decimal_marker``, which
+    the ``format_timestamp`` method forwards to the module-level helper.
+    """
+    # Whether formatted timestamps always carry the hours field.
+    always_include_hours: bool
+    # Separator placed between seconds and milliseconds in timestamps.
+    decimal_marker: str
+    def iterate_result(
+        self,
+        result: dict,
+        options: Optional[dict] = None,
+        *,
+        max_line_width: Optional[int] = None,
+        max_line_count: Optional[int] = None,
+        highlight_words: bool = False,
+        max_words_per_line: Optional[int] = None,
+    ):
+        """Yield ``(start, end, text)`` cues for ``result["segments"]``.
+        ``start`` and ``end`` are strings produced by ``self.format_timestamp``.
+        When the first segment has a ``"words"`` entry, cues are assembled from
+        the word timings, honouring the width / line-count / word-count limits.
+        Otherwise one cue is emitted per segment, with ``-->`` in its text
+        replaced by ``->``.
+        Each keyword argument that is falsy falls back to the entry of the
+        same name in ``options``. With ``highlight_words`` set, every cue is
+        repeated once per word with that word wrapped in ``<u>...</u>``.
+        """
+        options = options or {}
+        max_line_width = max_line_width or options.get("max_line_width")
+        max_line_count = max_line_count or options.get("max_line_count")
+        highlight_words = highlight_words or options.get("highlight_words", False)
+        max_words_per_line = max_words_per_line or options.get("max_words_per_line")
+        # Segment boundaries act as cue boundaries unless both limits are given.
+        preserve_segments = max_line_count is None or max_line_width is None
+        # Limits that are still unset fall back to 1000.
+        max_line_width = max_line_width or 1000
+        max_words_per_line = max_words_per_line or 1000
+        def iterate_subtitles():
+            """Yield lists of word-timing dicts, one list per subtitle cue."""
+            line_len = 0
+            line_count = 1
+            # the next subtitle to yield (a list of word timings with whitespace)
+            subtitle: List[dict] = []
+            # Start time of the previously handled word; used to detect pauses.
+            last: float = get_start(result["segments"]) or 0.0
+            for segment in result["segments"]:
+                chunk_index = 0
+                words_count = max_words_per_line
+                # Walk the segment in chunks of at most max_words_per_line words.
+                while chunk_index < len(segment["words"]):
+                    remaining_words = len(segment["words"]) - chunk_index
+                    # The final chunk of a segment may hold fewer words.
+                    if max_words_per_line > len(segment["words"]) - chunk_index:
+                        words_count = remaining_words
+                    for i, original_timing in enumerate(
+                        segment["words"][chunk_index : chunk_index + words_count]
+                    ):
+                        # Copy so rewriting "word" below leaves the input dict untouched.
+                        timing = original_timing.copy()
+                        # More than 3 s since the previous word started.
+                        long_pause = (
+                            not preserve_segments and timing["start"] - last > 3.0
+                        )
+                        has_room = line_len + len(timing["word"]) <= max_line_width
+                        # First word of a chunk while the current cue already has words.
+                        seg_break = i == 0 and len(subtitle) > 0 and preserve_segments
+                        if (
+                            line_len > 0
+                            and has_room
+                            and not long_pause
+                            and not seg_break
+                        ):
+                            # line continuation
+                            line_len += len(timing["word"])
+                        else:
+                            # new line
+                            timing["word"] = timing["word"].strip()
+                            # ``and`` binds tighter than ``or``: seg_break alone
+                            # is enough to start a new cue.
+                            if (
+                                len(subtitle) > 0
+                                and max_line_count is not None
+                                and (long_pause or line_count >= max_line_count)
+                                or seg_break
+                            ):
+                                # subtitle break
+                                yield subtitle
+                                subtitle = []
+                                line_count = 1
+                            elif line_len > 0:
+                                # line break
+                                line_count += 1
+                                timing["word"] = "\n" + timing["word"]
+                            # strip() keeps the "\n" added above out of the width count.
+                            line_len = len(timing["word"].strip())
+                        subtitle.append(timing)
+                        last = timing["start"]
+                    chunk_index += max_words_per_line
+            # Emit whatever is left once all segments are consumed.
+            if len(subtitle) > 0:
+                yield subtitle
+        # Word-level path is chosen by looking at the first segment only.
+        if len(result["segments"]) > 0 and "words" in result["segments"][0]:
+            for subtitle in iterate_subtitles():
+                subtitle_start = self.format_timestamp(subtitle[0]["start"])
+                subtitle_end = self.format_timestamp(subtitle[-1]["end"])
+                subtitle_text = "".join([word["word"] for word in subtitle])
+                if highlight_words:
+                    # Formatted end time of the previously emitted cue.
+                    last = subtitle_start
+                    all_words = [timing["word"] for timing in subtitle]
+                    for i, this_word in enumerate(subtitle):
+                        start = self.format_timestamp(this_word["start"])
+                        end = self.format_timestamp(this_word["end"])
+                        # Fill a gap before this word with the unhighlighted text.
+                        if last != start:
+                            yield last, start, subtitle_text
+                        # Underline word i only; leading whitespace stays outside the tag.
+                        yield start, end, "".join(
+                            [
+                                (
+                                    re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word)
+                                    if j == i
+                                    else word
+                                )
+                                for j, word in enumerate(all_words)
+                            ]
+                        )
+                        last = end
+                else:
+                    yield subtitle_start, subtitle_end, subtitle_text
+        else:
+            # No word timings: one cue per segment.
+            for segment in result["segments"]:
+                segment_start = self.format_timestamp(segment["start"])
+                segment_end = self.format_timestamp(segment["end"])
+                segment_text = segment["text"].strip().replace("-->", "->")
+                yield segment_start, segment_end, segment_text
+    def format_timestamp(self, seconds: float):
+        """Format ``seconds`` using this writer's hour and decimal-marker settings."""
+        return format_timestamp(
+            seconds=seconds,
+            always_include_hours=self.always_include_hours,
+            decimal_marker=self.decimal_marker,
+        )
+class WriteVTT(SubtitlesWriter):
+    """WebVTT writer: a ``WEBVTT`` header followed by one block per cue."""
+    extension: str = "vtt"
+    always_include_hours: bool = False
+    decimal_marker: str = "."
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        """Print the header, then each cue as a timing line, its text and a blank line."""
+        print("WEBVTT\n", file=file)
+        cues = self.iterate_result(result, options, **kwargs)
+        for cue_start, cue_end, cue_text in cues:
+            print(f"{cue_start} --> {cue_end}\n{cue_text}\n", file=file, flush=True)
+class WriteSRT(SubtitlesWriter):
+    """SubRip writer: numbered cues, hours always shown, comma before milliseconds."""
+    extension: str = "srt"
+    always_include_hours: bool = True
+    decimal_marker: str = ","
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        """Print every cue as its 1-based index, timing line, text and a blank line."""
+        cue_number = 0
+        for cue_start, cue_end, cue_text in self.iterate_result(
+            result, options, **kwargs
+        ):
+            cue_number += 1
+            print(
+                f"{cue_number}\n{cue_start} --> {cue_end}\n{cue_text}\n",
+                file=file,
+                flush=True,
+            )
+class WriteTSV(ResultWriter):
+    """
+    Tab-separated transcript writer. After a ``start``/``end``/``text`` header
+    row, each segment becomes one row holding its start time, end time and text.
+    Times are written as integer milliseconds, so no decimal separator appears
+    in the output. Tabs inside the text are replaced by spaces.
+    """
+    extension: str = "tsv"
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        """Print the header row, then one flushed row per segment."""
+        print("\t".join(("start", "end", "text")), file=file)
+        for entry in result["segments"]:
+            # Fields are written one at a time, in column order.
+            start_ms = round(1000 * entry["start"])
+            file.write(f"{start_ms}\t")
+            end_ms = round(1000 * entry["end"])
+            file.write(f"{end_ms}\t")
+            cell_text = entry["text"].strip().replace("\t", " ")
+            print(cell_text, file=file, flush=True)
+class WriteJSON(ResultWriter):
+    """JSON writer: dumps the whole result dict, keeping non-ASCII characters as-is."""
+    extension: str = "json"
+    def write_result(
+        self, result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        """Serialise ``result`` to ``file`` as JSON without ASCII escaping."""
+        json.dump(obj=result, fp=file, ensure_ascii=False)
+def get_writer(
+    output_format: str, output_dir: str
+) -> Callable[[dict, TextIO, dict], None]:
+    """Return a writer callable for ``output_format`` bound to ``output_dir``.
+    Known formats are ``txt``, ``vtt``, ``srt``, ``tsv`` and ``json``. The
+    special value ``all`` returns a function that runs every writer in that
+    order. Any other value raises ``KeyError`` from the lookup.
+    """
+    registry = {
+        "txt": WriteTXT,
+        "vtt": WriteVTT,
+        "srt": WriteSRT,
+        "tsv": WriteTSV,
+        "json": WriteJSON,
+    }
+    if output_format != "all":
+        return registry[output_format](output_dir)
+    every_writer = [writer_cls(output_dir) for writer_cls in registry.values()]
+    def write_all(
+        result: dict, file: TextIO, options: Optional[dict] = None, **kwargs
+    ):
+        """Forward the same arguments to each writer in turn."""
+        for emit in every_writer:
+            emit(result, file, options, **kwargs)
+    return write_all

nexaai/mlx_backend/mlx_audio/stt/tests/test_models.py ADDED Viewed

@@ -0,0 +1,381 @@
+import json
+import unittest
+from pathlib import Path
+from unittest.mock import ANY, MagicMock, PropertyMock, patch
+import mlx.core as mx
+import mlx.nn as nn
+import numpy as np
+from mlx_audio.stt.models.parakeet.parakeet import ParakeetTDT
+from mlx_audio.stt.models.whisper.audio import (
+    HOP_LENGTH,
+    N_FRAMES,
+    N_SAMPLES,
+    SAMPLE_RATE,
+)
+from mlx_audio.stt.models.whisper.decoding import DecodingOptions, DecodingResult
+from mlx_audio.stt.models.whisper.whisper import Model, ModelDimensions, STTOutput
+class TestWhisperModel(unittest.TestCase):
+    """Tests for the Whisper ``Model`` with loading and preprocessing patched out."""
+    def setUp(self):
+        """Build model dimensions and a ``Model``-specced mock shared by the tests."""
+        self.dims = ModelDimensions(
+            n_mels=80,
+            n_audio_ctx=1500,
+            n_audio_state=384,
+            n_audio_head=6,
+            n_audio_layer=4,
+            n_vocab=51864,
+            n_text_ctx=448,
+            n_text_state=384,
+            n_text_head=6,
+            n_text_layer=4,
+        )
+        self.model_mock = MagicMock(spec=Model, name="MockModelInstance")
+        self.model_mock.dims = self.dims
+        self.model_mock.dtype = mx.float32
+        # Properties are installed on the mock's type so attribute reads hit them.
+        type(self.model_mock).is_multilingual = PropertyMock(return_value=False)
+        type(self.model_mock).num_languages = PropertyMock(return_value=0)
+    # Patches are applied bottom-up, so the mock arguments arrive in the
+    # reverse order of the decorators.
+    @patch("mlx_audio.stt.models.whisper.whisper.Path")
+    @patch("mlx_audio.stt.models.whisper.whisper.snapshot_download")
+    @patch("mlx_audio.stt.models.whisper.whisper.mx.load")
+    @patch("mlx_audio.stt.models.whisper.whisper.json.loads")
+    @patch("builtins.open", new_callable=MagicMock)
+    def test_from_pretrained(
+        self,
+        mock_open,
+        mock_json_loads_in_whisper,
+        mock_mx_load,
+        mock_snapshot_download,
+        mock_pathlib_path,
+    ):
+        """Check that ``Model.from_pretrained`` downloads the snapshot and loads config and weights."""
+        mock_snapshot_download.return_value = "dummy_path"
+        # Cache of mock paths keyed by the constructor argument, so the same
+        # path always maps to the same mock object.
+        mock_paths_registry = {}
+        def path_constructor_side_effect(path_str_arg):
+            """Return the (cached) ``Path``-specced mock standing in for ``path_str_arg``."""
+            if path_str_arg in mock_paths_registry:
+                return mock_paths_registry[path_str_arg]
+            new_mock_path = MagicMock(spec=Path)
+            new_mock_path.__str__.return_value = str(path_str_arg)
+            # Only the snapshot directory and its weights file report that they exist.
+            if str(path_str_arg) == "dummy_path/weights.safetensors":
+                new_mock_path.exists.return_value = True
+            elif str(path_str_arg) == "dummy_path":
+                new_mock_path.exists.return_value = True
+            else:
+                new_mock_path.exists.return_value = False
+            def mock_truediv(other_segment):
+                """Emulate ``path / segment`` by resolving the joined string to a mock."""
+                concatenated_path_str = f"{str(path_str_arg)}/{other_segment}"
+                return path_constructor_side_effect(concatenated_path_str)
+            new_mock_path.__truediv__.side_effect = mock_truediv
+            new_mock_path.__rtruediv__ = mock_truediv
+            mock_paths_registry[path_str_arg] = new_mock_path
+            return new_mock_path
+        mock_pathlib_path.side_effect = path_constructor_side_effect
+        dummy_config = {
+            "n_mels": 80,
+            "n_audio_ctx": 1500,
+            "n_audio_state": 384,
+            "n_audio_head": 6,
+            "n_audio_layer": 4,
+            "n_vocab": 51865,
+            "n_text_ctx": 448,
+            "n_text_state": 384,
+            "n_text_head": 6,
+            "n_text_layer": 4,
+        }
+        # Both the file contents and json.loads are stubbed to produce dummy_config.
+        mock_open.return_value.__enter__.return_value.read.return_value = json.dumps(
+            dummy_config
+        )
+        mock_json_loads_in_whisper.return_value = dummy_config
+        dummy_weights = {
+            "encoder.conv1.weight": mx.random.normal((384, 80, 3)),
+            "encoder.conv1.bias": mx.random.normal((384,)),
+        }
+        mock_mx_load.return_value = dummy_weights
+        model_instance = Model.from_pretrained(
+            path_or_hf_repo="mlx-community/whisper-tiny", dtype=mx.float32
+        )
+        self.assertIsInstance(model_instance, Model)
+        self.assertEqual(model_instance.dims.n_mels, dummy_config["n_mels"])
+        mock_snapshot_download.assert_called_once_with(
+            repo_id="mlx-community/whisper-tiny"
+        )
+        mock_open.assert_called_once_with("dummy_path/config.json", "r")
+        mock_mx_load.assert_called_once_with("dummy_path/weights.safetensors")
+    @patch("mlx_audio.stt.models.whisper.whisper.pad_or_trim")
+    @patch("mlx_audio.stt.models.whisper.whisper.tqdm.tqdm")
+    @patch("mlx_audio.stt.models.whisper.whisper.get_tokenizer")
+    @patch("mlx_audio.stt.models.whisper.whisper.log_mel_spectrogram")
+    def test_generate_simple_case(
+        self,
+        mock_log_mel,
+        mock_get_tokenizer,
+        mock_tqdm_tqdm,
+        mock_pad_or_trim,
+    ):
+        """Test model.generate for a simple case with one segment."""
+        # The stubbed mel input is 100 frames longer than one N_FRAMES window.
+        mock_mel_data = mx.zeros((N_FRAMES + 100, self.dims.n_mels), dtype=mx.float32)
+        mock_log_mel.return_value = mock_mel_data
+        EOT_TOKEN_ID = 50257
+        TIMESTAMP_BEGIN_ID = 50364
+        mock_tokenizer_inst = MagicMock(
+            name="mock_tokenizer_instance_for_test",
+            eot=EOT_TOKEN_ID,
+            timestamp_begin=TIMESTAMP_BEGIN_ID,
+        )
+        def actual_decode_side_effect(tokens_to_decode):
+            """Fake decode: 100 -> "hello", 200 -> "world", stop at the EOT token."""
+            text_parts = []
+            for token_val in tokens_to_decode:
+                t = int(token_val)
+                if t == 100:
+                    text_parts.append("hello")
+                elif t == 200:
+                    text_parts.append("world")
+                elif t == EOT_TOKEN_ID:
+                    break
+            return " ".join(text_parts) if text_parts else ""
+        mock_tokenizer_inst.decode.side_effect = actual_decode_side_effect
+        mock_tokenizer_inst.encode.return_value = []
+        mock_get_tokenizer.return_value = mock_tokenizer_inst
+        decoded_tokens_list = [100, 200, EOT_TOKEN_ID]
+        # Canned result returned by the patched decode method below.
+        mock_decoding_result = DecodingResult(
+            tokens=mx.array(decoded_tokens_list),
+            temperature=0.0,
+            avg_logprob=-0.25,
+            compression_ratio=1.2,
+            no_speech_prob=0.05,
+            audio_features=mx.zeros((1, self.dims.n_mels), dtype=mx.float32),
+            language="en",
+        )
+        # Stand-in progress bar usable as a context manager.
+        mock_pbar = MagicMock()
+        mock_pbar.update = MagicMock()
+        mock_tqdm_constructor = MagicMock()
+        mock_tqdm_constructor.return_value.__enter__.return_value = mock_pbar
+        mock_tqdm_constructor.return_value.__exit__ = MagicMock()
+        mock_tqdm_tqdm.side_effect = mock_tqdm_constructor
+        def pad_or_trim_side_effect(array, length, axis):
+            """Replacement for pad_or_trim: zeros with ``length`` rows and the input's columns."""
+            return mx.zeros((length, array.shape[1]), dtype=array.dtype)
+        mock_pad_or_trim.side_effect = pad_or_trim_side_effect
+        dummy_audio_input = np.zeros(SAMPLE_RATE * 1, dtype=np.float32)
+        real_model_for_test = Model(self.dims, dtype=mx.float32)
+        # Patch this specific instance's 'decode' method
+        with patch.object(
+            real_model_for_test, "decode", return_value=mock_decoding_result
+        ) as mock_instance_decode:
+            output = real_model_for_test.generate(
+                dummy_audio_input,
+                language="en",
+                word_timestamps=False,
+                temperature=0.0,
+                fp16=False,
+            )
+            mock_instance_decode.assert_called_once()
+            args_decode_call, _ = mock_instance_decode.call_args
+            self.assertEqual(
+                args_decode_call[0].shape, (N_FRAMES, self.dims.n_mels)
+            )  # mel_segment
+            self.assertIsInstance(args_decode_call[1], DecodingOptions)
+            self.assertEqual(args_decode_call[1].language, "en")
+            self.assertEqual(args_decode_call[1].fp16, False)
+        self.assertIsInstance(output, STTOutput)
+        self.assertEqual(output.language, "en")
+        expected_text_output = "hello world"
+        self.assertEqual(output.text, expected_text_output)  #
+        self.assertIsInstance(output.segments, list)
+        self.assertEqual(len(output.segments), 1, "Should produce one segment")
+        # The single segment should mirror the canned decoding result.
+        segment = output.segments[0]
+        self.assertEqual(segment["text"], expected_text_output)
+        self.assertEqual(segment["tokens"], decoded_tokens_list)
+        self.assertEqual(segment["seek"], 0)
+        self.assertAlmostEqual(segment["start"], 0.0)
+        self.assertAlmostEqual(segment["end"], 1.0)
+        self.assertEqual(segment["temperature"], mock_decoding_result.temperature)
+        self.assertAlmostEqual(segment["avg_logprob"], mock_decoding_result.avg_logprob)
+        self.assertAlmostEqual(
+            segment["compression_ratio"], mock_decoding_result.compression_ratio
+        )
+        self.assertAlmostEqual(
+            segment["no_speech_prob"], mock_decoding_result.no_speech_prob
+        )
+        # Finally verify how the patched collaborators were called.
+        mock_log_mel.assert_called_once_with(
+            ANY, n_mels=self.dims.n_mels, padding=N_SAMPLES
+        )
+        np.testing.assert_array_equal(mock_log_mel.call_args[0][0], dummy_audio_input)
+        mock_get_tokenizer.assert_called_once_with(
+            real_model_for_test.is_multilingual,  # Reads from the instance
+            num_languages=real_model_for_test.num_languages,  # Reads from the instance
+            language="en",
+            task="transcribe",
+        )
+        mock_pad_or_trim.assert_called_once()
+        args_pad_call, _ = mock_pad_or_trim.call_args
+        self.assertEqual(args_pad_call[0].shape, (100, self.dims.n_mels))
+        self.assertEqual(args_pad_call[1], N_FRAMES)
+class TestParakeetModel(unittest.TestCase):
+    """Tests for ``ParakeetTDT`` with hub downloads and file loading patched out."""
+    # Patches are applied bottom-up, so the mock arguments arrive in the
+    # reverse order of the decorators.
+    @patch("mlx.nn.Module.load_weights")
+    @patch("mlx_audio.stt.models.parakeet.parakeet.hf_hub_download")
+    @patch("mlx_audio.stt.models.parakeet.parakeet.json.load")
+    @patch("mlx_audio.stt.models.parakeet.parakeet.open", new_callable=MagicMock)
+    @patch("mlx.core.load")
+    def test_parakeet_tdt_from_pretrained(
+        self,
+        mock_mlx_core_load,
+        mock_parakeet_module_open,
+        mock_parakeet_json_load,
+        mock_hf_hub_download,
+        mock_module_load_weights,
+    ):
+        """Test ParakeetTDT.from_pretrained method."""
+        dummy_repo_id = "dummy/parakeet-tdt-model"
+        dummy_config_path = "dummy_path/config.json"
+        dummy_weights_path = "dummy_path/model.safetensors"
+        # Configure hf_hub_download
+        def hf_hub_download_side_effect(repo_id_arg, filename_arg):
+            """Map the two expected files to dummy paths; reject any other request."""
+            if repo_id_arg == dummy_repo_id and filename_arg == "config.json":
+                return dummy_config_path
+            if repo_id_arg == dummy_repo_id and filename_arg == "model.safetensors":
+                return dummy_weights_path
+            raise ValueError(
+                f"Unexpected hf_hub_download call: {repo_id_arg}, {filename_arg}"
+            )
+        mock_hf_hub_download.side_effect = hf_hub_download_side_effect
+        # Dummy config content
+        dummy_vocabulary = [" ", "a", "b", "c"]
+        dummy_config_dict = {
+            "target": "nemo.collections.asr.models.rnnt_bpe_models.EncDecRNNTBPEModel",
+            "model_defaults": {"tdt_durations": [0, 1, 2, 3]},
+            "preprocessor": {
+                "sample_rate": 16000,
+                "normalize": "per_feature",
+                "window_size": 0.02,
+                "window_stride": 0.01,
+                "window": "hann",
+                "features": 80,
+                "n_fft": 512,
+                "dither": 1e-05,
+                "pad_to": 0,
+                "pad_value": 0.0,
+            },
+            "encoder": {
+                "feat_in": 80,
+                "n_layers": 17,
+                "d_model": 512,
+                "conv_dim": 512,
+                "n_heads": 8,
+                "self_attention_model": "rel_pos",
+                "subsampling": "dw_striding",
+                "causal_downsampling": False,
+                "pos_emb_max_len": 5000,
+                "ff_expansion_factor": 4,
+                "subsampling_factor": 4,
+                "subsampling_conv_channels": 512,
+                "dropout_rate": 0.1,
+                "attention_dropout_rate": 0.1,
+                "conv_dropout_rate": 0.1,
+                "conv_kernel_size": 31,
+                "causal_depthwise_conv": False,
+            },
+            "decoder": {
+                "blank_as_pad": True,
+                "vocab_size": len(dummy_vocabulary),
+                "input_dim": 512,
+                "hidden_dim": 512,
+                "output_dim": 1024,
+                "num_layers": 1,
+                "dropout_rate": 0.1,
+                "prednet": {
+                    "input_dim": 512,
+                    "pred_hidden": 512,
+                    "output_dim": 1024,
+                    "pred_rnn_layers": 1,
+                    "dropout_rate": 0.1,
+                },
+            },
+            "joint": {
+                "input_dim_encoder": 512,
+                "input_dim_decoder": 1024,
+                "num_classes": len(dummy_vocabulary) + 1,
+                "joint_dropout_rate": 0.1,
+                "vocabulary": dummy_vocabulary,
+                "jointnet": {
+                    "encoder_hidden": 512,
+                    "pred_hidden": 1024,
+                    "joint_hidden": 512,
+                    "activation": "relu",
+                },
+            },
+            "decoding": {
+                "model_type": "tdt",
+                "durations": [0, 1, 2, 3],
+                "greedy": {"max_symbols": 10},
+            },
+        }
+        # Configure mocks for config loading
+        mock_file_object_for_context_manager = (
+            MagicMock()
+        )  # This is what __enter__ would return
+        mock_parakeet_module_open.return_value.__enter__.return_value = (
+            mock_file_object_for_context_manager
+        )
+        # If open is used not as a context manager, its direct return value is the file handle
+        # json.load will be called with mock_parakeet_module_open.return_value
+        mock_parakeet_json_load.return_value = dummy_config_dict
+        # mlx.core.load is stubbed with a single placeholder tensor.
+        mock_mlx_core_load.return_value = {"some.valid.path.if.needed": mx.array([0.0])}
+        model = ParakeetTDT.from_pretrained(dummy_repo_id, dtype=mx.float32)
+        self.assertIsInstance(model, ParakeetTDT)
+        # Both files must have been requested from the hub.
+        mock_hf_hub_download.assert_any_call(dummy_repo_id, "config.json")
+        mock_hf_hub_download.assert_any_call(dummy_repo_id, "model.safetensors")
+        # Values from the dummy config should be reflected on the model.
+        self.assertEqual(model.preprocessor_config.sample_rate, 16000)
+        self.assertEqual(model.preprocessor_config.features, 80)
+        self.assertEqual(
+            model.encoder_config.d_model, 512
+        )  # d_model is correct for ConformerArgs
+        self.assertEqual(model.vocabulary, dummy_vocabulary)
+        self.assertEqual(model.durations, [0, 1, 2, 3])
+# Run the test suite when this module is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()