PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/mlx_audio/tts/models/outetts/dac_interface.py ADDED Viewed

@@ -0,0 +1,162 @@
+import math
+import mlx.core as mx
+import numpy as np
+import pyloudnorm as pyln
+import scipy.signal
+import soundfile as sf
+from mlx_audio.codec import DAC
+def process_audio_array(
+    audio: mx.array,
+    sample_rate: int = 24000,
+    target_loudness: float = -18.0,
+    peak_limit: float = -1,
+    block_size: float = 0.400,
+) -> mx.array:
+    audio_np = np.array(audio)
+    # handle multi-channel audio
+    if len(audio_np.shape) > 1:
+        if audio_np.shape[1] > 1:
+            audio_np = np.mean(audio_np, axis=1)
+        else:
+            audio_np = np.squeeze(audio_np)
+    original_length = len(audio_np)
+    min_samples = int(block_size * sample_rate)
+    if original_length < min_samples:
+        pad_length = min_samples - original_length
+        audio_padded = np.pad(audio_np, (0, pad_length), mode="constant")
+    else:
+        audio_padded = audio_np
+    # measure and normalize loudness
+    meter = pyln.Meter(sample_rate, block_size=block_size)
+    measured_loudness = meter.integrated_loudness(audio_padded)
+    normalized = pyln.normalize.loudness(
+        audio_padded, measured_loudness, target_loudness
+    )
+    # apply peak limiting if necessary
+    peak_value = np.max(np.abs(normalized))
+    threshold_value = 10 ** (peak_limit / 20)
+    if peak_value > threshold_value:
+        normalized = pyln.normalize.peak(normalized, peak_limit)
+    if original_length < min_samples:
+        normalized = normalized[:original_length]
+    normalized_array = mx.array(normalized).reshape(1, 1, -1)
+    return normalized_array
+class DacInterface:
+    def __init__(self, repo_id: str = "mlx-community/dac-speech-24khz-1.5kbps"):
+        self.model = DAC.from_pretrained(repo_id)
+        self.sr = 24000
+    def convert_audio(
+        self, audio: mx.array, sr: int, target_sr: int, target_channels: int
+    ):
+        audio_np = np.array(audio)
+        if len(audio_np.shape) < 2:
+            audio_np = audio_np.reshape(1, -1)
+        channels, length = audio_np.shape[-2:]
+        if target_channels == 1:
+            if channels > 1:
+                audio_np = np.mean(audio_np, axis=-2, keepdims=True)
+        elif target_channels == 2:
+            if channels == 1:
+                audio_np = np.repeat(audio_np, 2, axis=-2)
+            elif channels > 2:
+                audio_np = audio_np[..., :2, :]
+        if sr != target_sr:
+            new_length = int(length * target_sr / sr)
+            resampled = np.zeros((target_channels, new_length))
+            for ch in range(target_channels):
+                resampled[ch] = scipy.signal.resample(audio_np[ch], new_length)
+            audio_np = resampled
+        return mx.array(audio_np)
+    def convert_audio_array(self, audio: mx.array, sr):
+        return self.convert_audio(audio, sr, self.sr, 1)
+    def load_audio(self, path):
+        audio_np, sr = sf.read(path)
+        audio = mx.array(audio_np)
+        if len(audio.shape) == 1:
+            audio = audio.reshape(1, -1)
+        # if stereo, reshape to channels-first format
+        elif len(audio.shape) > 1 and audio.shape[0] > audio.shape[1]:
+            audio = audio.T
+        return self.convert_audio_array(audio, sr).reshape(1, 1, -1)
+    def preprocess(self, audio_data):
+        length = audio_data.shape[-1]
+        hop_length = self.model.hop_length
+        right_pad = math.ceil(length / hop_length) * hop_length - length
+        audio_data = mx.pad(audio_data, [(0, 0), (0, 0), (0, right_pad)])
+        return audio_data
+    def encode(self, x: mx.array, win_duration: int = 5.0, verbose: bool = False):
+        x = process_audio_array(x)
+        nb, nac, nt = x.shape
+        x = x.reshape(nb * nac, 1, nt)
+        n_samples = int(win_duration * self.sr)
+        n_samples = int(
+            math.ceil(n_samples / self.model.hop_length) * self.model.hop_length
+        )
+        hop = n_samples
+        codes_list = []
+        if verbose:
+            from tqdm import trange
+            range_fn = trange
+        else:
+            range_fn = range
+        for i in range_fn(0, nt, hop):
+            chunk = x[..., i : i + n_samples]
+            audio_data = self.preprocess(chunk)
+            _, c, _, _, _ = self.model.encode(audio_data, None)
+            codes_list.append(c)
+        codes = mx.concatenate(codes_list, axis=-1)
+        return codes
+    def decode(self, codes: mx.array, verbose: bool = False) -> mx.array:
+        model = self.model
+        chunk_length = 4096
+        recons = []
+        if verbose:
+            from tqdm import trange
+            range_fn = trange
+        else:
+            range_fn = range
+        @mx.compile
+        def decode_chunk(codes):
+            z = model.quantizer.from_codes(codes)[0]
+            r = model.decode(z)
+            return r
+        for i in range_fn(0, codes.shape[-1], chunk_length):
+            c = codes[..., i : i + chunk_length]
+            recons.append(decode_chunk(c))
+        recons = mx.concatenate(recons, axis=-1)
+        return process_audio_array(recons.swapaxes(1, 2))

nexaai/mlx_backend/mlx_audio/tts/models/outetts/default_speaker.json ADDED Viewed

@@ -0,0 +1,461 @@
+{
+  "text": "The cat watched from the windowsill, tail flicking with quiet curiosity as the first snowflakes of winter began to fall, dusting the world in fragile white.",
+  "words": [
+    {
+      "word": "The",
+      "duration": 0.2,
+      "c1": [
+        720, 720, 474, 691, 607, 126, 597, 607, 897, 288, 362, 903, 333, 1009,
+        79
+      ],
+      "c2": [
+        658, 663, 237, 915, 74, 74, 966, 721, 893, 722, 630, 516, 861, 385, 149
+      ],
+      "features": {
+        "energy": 10,
+        "spectral_centroid": 15,
+        "pitch": 45
+      }
+    },
+    {
+      "word": "cat",
+      "duration": 0.33,
+      "c1": [
+        700, 597, 639, 838, 622, 336, 975, 326, 67, 375, 853, 761, 35, 363, 31,
+        1000, 982, 192, 647, 564, 329, 1002, 275, 480, 551
+      ],
+      "c2": [
+        34, 810, 457, 546, 42, 631, 339, 867, 115, 1011, 509, 369, 473, 85, 190,
+        715, 391, 518, 562, 986, 749, 193, 530, 327, 820
+      ],
+      "features": {
+        "energy": 14,
+        "spectral_centroid": 21,
+        "pitch": 35
+      }
+    },
+    {
+      "word": "watched",
+      "duration": 0.44,
+      "c1": [
+        625, 668, 168, 524, 462, 151, 549, 951, 597, 820, 489, 329, 377, 144,
+        112, 16, 481, 133, 195, 744, 144, 750, 288, 500, 1000, 58, 916, 597, 72,
+        336, 224, 476, 581
+      ],
+      "c2": [
+        204, 421, 318, 677, 74, 953, 903, 413, 809, 37, 634, 824, 933, 200, 14,
+        1007, 111, 17, 435, 718, 559, 783, 415, 821, 958, 247, 14, 721, 158,
+        235, 276, 875, 683
+      ],
+      "features": {
+        "energy": 19,
+        "spectral_centroid": 21,
+        "pitch": 26
+      }
+    },
+    {
+      "word": "from",
+      "duration": 0.2,
+      "c1": [
+        528, 668, 738, 985, 126, 924, 1003, 325, 393, 86, 114, 392, 638, 915,
+        549
+      ],
+      "c2": [
+        929, 872, 332, 296, 983, 406, 867, 568, 374, 328, 419, 348, 177, 379,
+        181
+      ],
+      "features": {
+        "energy": 10,
+        "spectral_centroid": 29,
+        "pitch": 14
+      }
+    },
+    {
+      "word": "the",
+      "duration": 0.12,
+      "c1": [470, 985, 152, 474, 967, 558, 460, 728, 470],
+      "c2": [596, 246, 314, 246, 756, 238, 606, 262, 499],
+      "features": {
+        "energy": 23,
+        "spectral_centroid": 10,
+        "pitch": 23
+      }
+    },
+    {
+      "word": "windowsill,",
+      "duration": 0.75,
+      "c1": [
+        217, 126, 549, 700, 198, 891, 95, 683, 158, 680, 16, 769, 402, 776, 295,
+        258, 68, 213, 669, 865, 719, 29, 949, 329, 216, 481, 284, 224, 221, 359,
+        328, 311, 415, 443, 410, 359, 600, 590, 932, 611, 905, 304, 292, 72,
+        388, 333, 66, 943, 489, 648, 630, 648, 402, 972, 392, 558
+      ],
+      "c2": [
+        911, 19, 1007, 169, 185, 182, 399, 849, 656, 963, 265, 80, 453, 768,
+        919, 1010, 501, 794, 141, 123, 93, 694, 499, 174, 768, 689, 598, 686,
+        10, 381, 282, 556, 126, 672, 872, 650, 990, 556, 913, 635, 174, 819,
+        999, 423, 64, 272, 112, 600, 453, 678, 791, 301, 206, 187, 819, 948
+      ],
+      "features": {
+        "energy": 17,
+        "spectral_centroid": 25,
+        "pitch": 24
+      }
+    },
+    {
+      "word": "tail",
+      "duration": 0.6,
+      "c1": [
+        669, 94, 917, 202, 607, 720, 625, 597, 126, 607, 885, 700, 474, 480,
+        126, 126, 551, 720, 126, 551, 720, 607, 572, 234, 114, 963, 963, 975,
+        587, 119, 378, 696, 730, 375, 46, 827, 515, 447, 979, 138, 22, 267, 43,
+        495, 16
+      ],
+      "c2": [
+        1011, 336, 157, 39, 1000, 721, 862, 413, 557, 569, 74, 569, 141, 493,
+        124, 775, 204, 588, 74, 588, 810, 124, 102, 1021, 83, 848, 297, 339,
+        335, 684, 400, 905, 909, 710, 460, 115, 81, 628, 224, 663, 892, 247,
+        392, 234, 132
+      ],
+      "features": {
+        "energy": 15,
+        "spectral_centroid": 23,
+        "pitch": 34
+      }
+    },
+    {
+      "word": "flicking",
+      "duration": 0.45,
+      "c1": [
+        978, 489, 630, 588, 436, 798, 4, 975, 245, 325, 415, 4, 393, 4, 4, 997,
+        982, 437, 444, 180, 861, 868, 225, 440, 780, 597, 720, 639, 168, 426,
+        114, 621, 854, 869
+      ],
+      "c2": [
+        571, 321, 376, 232, 301, 678, 904, 630, 990, 772, 690, 870, 719, 694,
+        332, 558, 301, 194, 279, 443, 852, 64, 709, 401, 401, 14, 74, 873, 134,
+        754, 1002, 595, 540, 525
+      ],
+      "features": {
+        "energy": 9,
+        "spectral_centroid": 22,
+        "pitch": 23
+      }
+    },
+    {
+      "word": "with",
+      "duration": 0.23,
+      "c1": [
+        621, 392, 756, 459, 433, 881, 786, 198, 702, 847, 490, 27, 680, 146, 58,
+        808, 997
+      ],
+      "c2": [
+        460, 840, 840, 303, 847, 534, 801, 99, 662, 666, 510, 132, 376, 96, 639,
+        240, 668
+      ],
+      "features": {
+        "energy": 11,
+        "spectral_centroid": 15,
+        "pitch": 20
+      }
+    },
+    {
+      "word": "quiet",
+      "duration": 0.37,
+      "c1": [
+        969, 291, 572, 720, 625, 85, 698, 478, 811, 956, 232, 85, 962, 817, 986,
+        483, 835, 526, 77, 187, 178, 50, 440, 16, 198, 237, 418, 862
+      ],
+      "c2": [
+        498, 606, 24, 629, 662, 181, 119, 678, 340, 736, 217, 204, 935, 796,
+        118, 478, 818, 791, 329, 209, 5, 234, 337, 647, 110, 922, 933, 1011
+      ],
+      "features": {
+        "energy": 12,
+        "spectral_centroid": 12,
+        "pitch": 43
+      }
+    },
+    {
+      "word": "curiosity",
+      "duration": 0.71,
+      "c1": [
+        321, 402, 215, 607, 720, 224, 731, 621, 491, 720, 551, 456, 336, 688,
+        476, 953, 718, 806, 410, 786, 976, 664, 855, 433, 756, 396, 699, 776,
+        443, 739, 932, 22, 305, 353, 503, 564, 978, 407, 395, 798, 324, 168,
+        909, 328, 328, 443, 738, 114, 962, 681, 535, 701, 382
+      ],
+      "c2": [
+        777, 665, 629, 327, 831, 764, 162, 725, 810, 170, 629, 774, 108, 948,
+        972, 449, 600, 905, 81, 765, 601, 422, 820, 746, 450, 346, 733, 77, 733,
+        81, 722, 576, 286, 271, 714, 95, 346, 133, 514, 799, 122, 900, 568, 666,
+        209, 668, 558, 630, 165, 587, 423, 904, 629
+      ],
+      "features": {
+        "energy": 10,
+        "spectral_centroid": 29,
+        "pitch": 22
+      }
+    },
+    {
+      "word": "as",
+      "duration": 0.48,
+      "c1": [
+        474, 936, 336, 589, 254, 854, 79, 140, 863, 854, 701, 260, 929, 140,
+        669, 808, 411, 232, 434, 542, 597, 126, 551, 126, 607, 1011, 774, 681,
+        94, 25, 971, 288, 305, 347, 355, 415
+      ],
+      "c2": [
+        267, 813, 232, 361, 77, 607, 252, 933, 508, 658, 846, 849, 873, 496,
+        832, 167, 440, 124, 557, 124, 736, 588, 569, 983, 497, 360, 810, 274,
+        588, 365, 517, 934, 957, 839, 646, 720
+      ],
+      "features": {
+        "energy": 7,
+        "spectral_centroid": 31,
+        "pitch": 23
+      }
+    },
+    {
+      "word": "the",
+      "duration": 0.13,
+      "c1": [359, 568, 700, 985, 80, 580, 274, 129, 600, 794],
+      "c2": [423, 833, 245, 690, 209, 688, 765, 453, 677, 615],
+      "features": {
+        "energy": 9,
+        "spectral_centroid": 26,
+        "pitch": 20
+      }
+    },
+    {
+      "word": "first",
+      "duration": 0.36,
+      "c1": [
+        997, 325, 147, 4, 780, 669, 621, 896, 30, 686, 526, 399, 210, 783, 216,
+        144, 329, 448, 481, 288, 132, 600, 168, 221, 415, 415, 528
+      ],
+      "c2": [
+        325, 666, 627, 629, 240, 665, 650, 481, 962, 328, 128, 358, 166, 264,
+        555, 30, 815, 10, 669, 525, 450, 746, 919, 621, 647, 16, 601
+      ],
+      "features": {
+        "energy": 13,
+        "spectral_centroid": 28,
+        "pitch": 22
+      }
+    },
+    {
+      "word": "snowflakes",
+      "duration": 0.76,
+      "c1": [
+        1003, 680, 607, 720, 126, 668, 336, 224, 114, 997, 426, 997, 147, 221,
+        359, 328, 1003, 738, 974, 151, 782, 179, 190, 553, 453, 761, 778, 23,
+        128, 643, 125, 7, 345, 223, 275, 524, 325, 764, 114, 953, 70, 75, 449,
+        513, 783, 830, 825, 365, 819, 920, 669, 700, 700, 720, 220, 209, 221
+      ],
+      "c2": [
+        276, 489, 810, 975, 775, 913, 1022, 818, 340, 481, 690, 366, 924, 782,
+        366, 481, 400, 998, 872, 556, 688, 719, 78, 952, 119, 412, 286, 847, 60,
+        381, 86, 694, 779, 55, 246, 374, 143, 91, 209, 640, 313, 873, 295, 355,
+        333, 705, 468, 1008, 317, 87, 105, 511, 260, 650, 574, 88, 690
+      ],
+      "features": {
+        "energy": 12,
+        "spectral_centroid": 29,
+        "pitch": 22
+      }
+    },
+    {
+      "word": "of",
+      "duration": 0.15,
+      "c1": [443, 328, 528, 85, 313, 145, 588, 140, 114, 325, 325],
+      "c2": [924, 835, 400, 832, 397, 1011, 695, 716, 366, 489, 487],
+      "features": {
+        "energy": 7,
+        "spectral_centroid": 34,
+        "pitch": 13
+      }
+    },
+    {
+      "word": "winter",
+      "duration": 0.29,
+      "c1": [
+        559, 71, 549, 64, 902, 609, 206, 386, 428, 529, 92, 1020, 148, 456, 605,
+        673, 958, 897, 250, 716, 236, 232
+      ],
+      "c2": [
+        891, 358, 1016, 185, 558, 392, 63, 45, 238, 404, 603, 520, 657, 628,
+        748, 649, 629, 298, 772, 483, 1008, 401
+      ],
+      "features": {
+        "energy": 18,
+        "spectral_centroid": 16,
+        "pitch": 31
+      }
+    },
+    {
+      "word": "began",
+      "duration": 0.24,
+      "c1": [
+        490, 6, 596, 669, 1011, 700, 583, 349, 666, 783, 215, 126, 61, 22, 945,
+        773, 920, 975
+      ],
+      "c2": [
+        194, 225, 140, 243, 14, 650, 929, 671, 323, 365, 556, 298, 707, 483,
+        550, 57, 127, 886
+      ],
+      "features": {
+        "energy": 11,
+        "spectral_centroid": 12,
+        "pitch": 18
+      }
+    },
+    {
+      "word": "to",
+      "duration": 0.2,
+      "c1": [
+        265, 1021, 113, 178, 698, 561, 97, 402, 25, 916, 766, 660, 159, 945, 967
+      ],
+      "c2": [
+        141, 976, 455, 403, 760, 738, 519, 123, 327, 721, 690, 904, 689, 140,
+        615
+      ],
+      "features": {
+        "energy": 13,
+        "spectral_centroid": 19,
+        "pitch": 20
+      }
+    },
+    {
+      "word": "fall,",
+      "duration": 0.39,
+      "c1": [
+        781, 325, 4, 114, 997, 415, 4, 443, 953, 781, 399, 993, 489, 383, 920,
+        383, 272, 755, 843, 450, 763, 392, 411, 682, 895, 443, 490, 863, 79
+      ],
+      "c2": [
+        143, 990, 209, 990, 990, 556, 462, 952, 914, 702, 301, 833, 779, 982,
+        26, 458, 519, 9, 264, 74, 304, 110, 646, 905, 185, 959, 53, 543, 909
+      ],
+      "features": {
+        "energy": 13,
+        "spectral_centroid": 14,
+        "pitch": 18
+      }
+    },
+    {
+      "word": "dusting",
+      "duration": 0.89,
+      "c1": [
+        27, 669, 490, 691, 691, 625, 625, 572, 474, 885, 215, 215, 215, 215,
+        215, 215, 75, 718, 94, 924, 232, 818, 14, 232, 985, 547, 955, 4, 627,
+        524, 524, 579, 462, 104, 597, 720, 720, 491, 597, 571, 802, 864, 315,
+        515, 832, 219, 133, 923, 773, 245, 415, 328, 590, 80, 528, 322, 808,
+        551, 625, 716, 158, 562, 712, 477, 905, 920, 424
+      ],
+      "c2": [
+        206, 521, 77, 447, 260, 810, 74, 301, 243, 775, 243, 775, 880, 862,
+        1017, 806, 806, 631, 873, 806, 806, 722, 14, 531, 630, 500, 990, 240,
+        690, 431, 240, 815, 449, 273, 903, 569, 325, 629, 872, 239, 686, 189,
+        774, 264, 314, 628, 107, 120, 560, 929, 1008, 610, 24, 929, 400, 949,
+        431, 721, 447, 443, 774, 392, 923, 855, 747, 144, 460
+      ],
+      "features": {
+        "energy": 14,
+        "spectral_centroid": 28,
+        "pitch": 30
+      }
+    },
+    {
+      "word": "the",
+      "duration": 0.12,
+      "c1": [396, 433, 276, 530, 316, 117, 112, 7, 531],
+      "c2": [332, 479, 262, 239, 123, 239, 453, 499, 545],
+      "features": {
+        "energy": 23,
+        "spectral_centroid": 11,
+        "pitch": 30
+      }
+    },
+    {
+      "word": "world",
+      "duration": 0.32,
+      "c1": [
+        217, 489, 897, 607, 402, 383, 496, 937, 247, 206, 790, 32, 406, 856,
+        715, 458, 278, 481, 503, 399, 871, 453, 858, 392
+      ],
+      "c2": [
+        593, 959, 461, 546, 242, 438, 81, 99, 939, 361, 269, 571, 525, 542, 246,
+        10, 613, 228, 913, 252, 132, 132, 287, 559
+      ],
+      "features": {
+        "energy": 22,
+        "spectral_centroid": 11,
+        "pitch": 31
+      }
+    },
+    {
+      "word": "in",
+      "duration": 0.23,
+      "c1": [
+        558, 497, 436, 598, 607, 416, 311, 906, 955, 905, 448, 54, 92, 487, 770,
+        298, 490
+      ],
+      "c2": [
+        838, 399, 420, 819, 325, 929, 124, 214, 1021, 728, 975, 688, 132, 718,
+        724, 911, 536
+      ],
+      "features": {
+        "energy": 14,
+        "spectral_centroid": 16,
+        "pitch": 22
+      }
+    },
+    {
+      "word": "fragile",
+      "duration": 0.41,
+      "c1": [
+        415, 325, 953, 359, 325, 838, 359, 764, 842, 341, 706, 674, 971, 592,
+        507, 16, 628, 481, 626, 691, 1011, 610, 336, 476, 528, 637, 472, 251,
+        945, 811, 406
+      ],
+      "c2": [
+        126, 990, 374, 143, 629, 868, 338, 91, 346, 393, 407, 987, 987, 1009,
+        617, 854, 824, 439, 789, 311, 810, 497, 664, 549, 135, 908, 702, 639,
+        320, 698, 414
+      ],
+      "features": {
+        "energy": 13,
+        "spectral_centroid": 20,
+        "pitch": 18
+      }
+    },
+    {
+      "word": "white.",
+      "duration": 0.75,
+      "c1": [
+        26, 432, 1, 651, 998, 716, 998, 727, 978, 311, 85, 895, 279, 392, 669,
+        916, 549, 1011, 97, 597, 296, 392, 526, 998, 835, 468, 871, 405, 26,
+        759, 524, 107, 77, 22, 260, 682, 621, 79, 682, 411, 701, 972, 691, 720,
+        551, 597, 660, 224, 236, 70, 652, 215, 126, 474, 597, 625
+      ],
+      "c2": [
+        475, 778, 695, 612, 913, 315, 536, 593, 55, 371, 19, 560, 821, 646, 151,
+        801, 821, 413, 14, 922, 629, 380, 417, 679, 487, 562, 821, 706, 324,
+        896, 169, 594, 810, 864, 810, 588, 862, 969, 14, 105, 528, 165, 420,
+        170, 821, 423, 977, 904, 690, 235, 702, 14, 124, 350, 74, 413
+      ],
+      "features": {
+        "energy": 13,
+        "spectral_centroid": 11,
+        "pitch": 23
+      }
+    }
+  ],
+  "global_features": {
+    "energy": 13,
+    "spectral_centroid": 20,
+    "pitch": 28
+  },
+  "interface_version": 3
+}