PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/normalize.py ADDED Viewed

@@ -0,0 +1,294 @@
+import re
+from typing import Dict, List, Tuple
# Punctuation normalization table: full-width / typographic marks are mapped
# onto a small ASCII-like set, and runs of three marks collapse to an ellipsis.
# Key order is preserved from the original table because an ordered regex
# alternation may be built from it.
CHAR_MAP = {
    "：": ",",
    "；": ",",
    ";": ",",
    "，": ",",
    "。": ".",
    "！": "!",
    "？": "?",
    "\n": " ",
    "·": "-",
    "、": ",",
    "...": "…",
    ",,,": "…",
    "，，，": "…",
    "……": "…",
    # Typographic quotes. The listing had these flattened to straight quotes,
    # which turned the two double-quote entries into one junk key.
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "（": "'",
    "）": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "～": "-",
    "~": "-",
    "「": "'",
    "」": "'",
    ":": ",",
}
# Table for the Chinese path: everything in CHAR_MAP plus "$" -> ".".
# CHAR_MAP is unpacked last, so it would win if it ever defined "$" itself.
ZH_CHAR_MAP = {"$": ".", **CHAR_MAP}
# Toned pinyin syllable. Group 1 is the syllable (optional initial + final, or
# "ng"/"er"); group 2 is the tone digit 1-5. The lookbehind rejects a match
# that starts directly after a letter a-z.
PINYIN_PATTERN = r"(?<![a-z])((?:[bpmfdtnlgkhjqxzcsryw]|[zcs]h)?(?:[aeiouüv]|[ae]i|u[aio]|ao|ou|i[aue]|[uüv]e|[uvü]ang?|uai|[aeiuv]n|[aeio]ng|ia[no]|i[ao]ng)|ng|er)([1-5])"
# Two or three runs of characters in U+4E00-U+9FFF joined by "-", "·" or "—".
NAME_PATTERN = r"[\u4e00-\u9fff]+(?:[-·—][\u4e00-\u9fff]+){1,2}"
# "<word>'s" for a fixed list of pronouns / question words; group 1 is the word.
# There is no word-boundary anchor, so the word may match as a suffix of a
# longer token.
CONTRACTION_PATTERN = r"(what|where|who|which|how|t?here|it|s?he|that|this)'s"
# Whole-string shape: ASCII alphanumerics, "@", ASCII alphanumerics, ".", letters.
EMAIL_PATTERN = r"^[a-zA-Z0-9]+@[a-zA-Z0-9]+\.[a-zA-Z]+$"
def is_email(text: str) -> bool:
    """Return True when ``text`` matches ``EMAIL_PATTERN``."""
    matched = re.match(EMAIL_PATTERN, text)
    return matched is not None
def has_chinese(text: str) -> bool:
    """Return True when ``text`` contains a character in U+4E00-U+9FFF."""
    return re.search(r"[\u4e00-\u9fff]", text) is not None
def has_alpha(text: str) -> bool:
    """Return True when ``text`` contains at least one ASCII letter."""
    return re.search(r"[a-zA-Z]", text) is not None
def has_pinyin(text: str) -> bool:
    """Return True when ``PINYIN_PATTERN`` matches anywhere in ``text`` (case-insensitive)."""
    found = re.search(PINYIN_PATTERN, text, re.IGNORECASE)
    return found is not None
def use_chinese(text: str) -> bool:
    """Decide whether ``text`` should take the Chinese normalization path.

    True when the text contains a CJK ideograph, contains no ASCII letter at
    all, is an e-mail address per ``is_email``, or contains toned pinyin.
    The checks run in that order and stop at the first hit.
    """
    if has_chinese(text):
        return True
    if not has_alpha(text):
        return True
    return is_email(text) or has_pinyin(text)
def replace_chars(text: str, char_map: Dict[str, str]) -> str:
    """Replace every occurrence of a ``char_map`` key in ``text`` with its value.

    Args:
        text: Input string.
        char_map: Literal substrings mapped to their replacements.

    Returns:
        The text after a single left-to-right substitution pass.
    """
    if not char_map:
        # An empty alternation would match the empty string everywhere and
        # then fail the lookup; nothing to replace, so return unchanged.
        return text
    # Regex alternation takes the first alternative that matches, so longer
    # keys must come first or "，" would always shadow "，，，". sorted() is
    # stable, so keys of equal length keep their dictionary order.
    ordered_keys = sorted(char_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))
    return pattern.sub(lambda found: char_map[found.group()], text)
def extract_all_digits(text):
    """Return the digit characters of ``text`` concatenated in their original order."""
    kept = [ch for ch in text if str.isdigit(ch)]
    return "".join(kept)
def expand_contractions(text: str) -> str:
    """Rewrite ``<word>'s`` as ``<word> is`` for the words listed in ``CONTRACTION_PATTERN``.

    Matching is case-insensitive; the captured word keeps its original casing.
    """
    matcher = re.compile(CONTRACTION_PATTERN, re.IGNORECASE)
    return matcher.sub(r"\1 is", text)
def correct_pinyin(pinyin: str) -> str:
    """Rewrite ``u``/``ü`` after an initial j, q or x as ``v`` and upper-case the result.

    Syllables that do not start with j/q/x (either case) are returned
    untouched, including their casing. For example ``ju4`` becomes ``JV4``.
    """
    if pinyin[0] in "jqxJQX":
        rewritten = re.sub(
            r"([jqx])[uü](n|e|an)*(\d)",
            r"\g<1>v\g<2>\g<3>",
            pinyin,
            flags=re.IGNORECASE,
        )
        return rewritten.upper()
    return pinyin
def extract_patterns(text: str, pattern: str) -> List[str]:
    """Return the distinct case-insensitive matches of ``pattern`` in ``text``.

    When the pattern has several capture groups, the groups of each match are
    concatenated into one string.

    Returns:
        Unique matches ordered longest-first, ties broken alphabetically. The
        order is deterministic across runs, and a match that is a substring of
        another always comes after the longer one.
    """
    compiled = re.compile(pattern, re.IGNORECASE)
    unique = {"".join(found) for found in compiled.findall(text)}
    return sorted(unique, key=lambda item: (-len(item), item))
def create_placeholders(items: List[str], prefix: str) -> Dict[str, str]:
    """Assign each item a placeholder of the form ``<prefix_suffix>``.

    Suffixes are lowercase letters in spreadsheet-column order: ``a`` to ``z``
    for the first 26 items, then ``aa``, ``ab`` and so on. Placeholders
    therefore never contain punctuation or control characters, however many
    items there are.

    Args:
        items: Strings to protect.
        prefix: Tag placed before the suffix.

    Returns:
        Mapping from each item to its placeholder.
    """

    def letter_suffix(index: int) -> str:
        # Bijective base-26 encoding: 0 -> "a", 25 -> "z", 26 -> "aa".
        letters = []
        index += 1
        while index > 0:
            index, remainder = divmod(index - 1, 26)
            letters.append(chr(ord("a") + remainder))
        return "".join(reversed(letters))

    return {
        item: f"<{prefix}_{letter_suffix(position)}>"
        for position, item in enumerate(items)
    }
def apply_placeholders(text: str, placeholders: Dict[str, str]) -> str:
    """Replace each key of ``placeholders`` found in ``text`` with its placeholder.

    Args:
        text: Input string.
        placeholders: Original substrings mapped to their placeholder tokens.

    Returns:
        The text with every original substring substituted.
    """
    result = text
    # Longest first: replacing a short key such as "an1" before "xuan1" would
    # break the longer key apart and leave it unprotected.
    for original in sorted(placeholders, key=len, reverse=True):
        if not original:
            # str.replace("") would insert the token between every character.
            continue
        result = result.replace(original, placeholders[original])
    return result
def restore_placeholders(
    text: str, placeholders: Dict[str, str], transform_fn=None
) -> str:
    """Swap placeholder tokens in ``text`` back to their original strings.

    Args:
        text: String containing placeholder tokens.
        placeholders: Original strings mapped to their tokens.
        transform_fn: Optional callable applied to each original string before
            it is put back.

    Returns:
        The text with every token substituted.
    """
    restored = text
    for source, token in placeholders.items():
        if transform_fn:
            value = transform_fn(source)
        else:
            value = source
        restored = restored.replace(token, value)
    return restored
def save_and_replace(
    text: str, pattern: str, prefix: str
) -> Tuple[str, Dict[str, str]]:
    """Hide every match of ``pattern`` in ``text`` behind a placeholder.

    Returns:
        ``(new_text, mapping)`` where ``mapping`` maps each matched string to
        its placeholder; ``(text, {})`` when nothing matched.
    """
    found = extract_patterns(text, pattern)
    if found:
        mapping = create_placeholders(found, prefix)
        return apply_placeholders(text, mapping), mapping
    return text, {}
+# number normalizers
def number_to_words(n: int):
    """Spell an integer in English words, without "and" or hyphens.

    Args:
        n: Integer to spell. Negative values are prefixed with "minus".

    Returns:
        For example ``"one thousand two hundred thirty four"`` for 1234.
        Magnitudes beyond the largest named scale are read digit by digit.
    """
    digit_names = [
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    ]
    teens = [
        "ten", "eleven", "twelve", "thirteen", "fourteen",
        "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
    ]
    tens = [
        "", "", "twenty", "thirty", "forty",
        "fifty", "sixty", "seventy", "eighty", "ninety",
    ]
    # Name of each successive group of three digits, lowest group first.
    scales = [
        "", "thousand", "million", "billion", "trillion", "quadrillion",
        "quintillion", "sextillion", "septillion", "octillion", "nonillion",
        "decillion",
    ]

    def convert_hundreds(num):
        # Spell a value in 1..999; 0 yields the empty string.
        if num == 0:
            return ""
        if num < 10:
            return digit_names[num]
        if num < 20:
            return teens[num - 10]
        if num < 100:
            unit = num % 10
            return tens[num // 10] + (" " + digit_names[unit] if unit else "")
        rest = num % 100
        return (
            digit_names[num // 100]
            + " hundred"
            + (" " + convert_hundreds(rest) if rest else "")
        )

    def convert_number(num: int):
        if num == 0:
            return "zero"
        if num < 0:
            return "minus " + convert_number(-num)
        if num >= 1000 ** len(scales):
            # No scale name left: read the digits one at a time instead of
            # indexing past the end of the scale list.
            return " ".join(digit_names[int(ch)] for ch in str(num))
        groups = []
        group_idx = 0
        while num > 0:
            num, group = divmod(num, 1000)
            if group:
                group_words = convert_hundreds(group)
                if scales[group_idx]:
                    group_words += " " + scales[group_idx]
                groups.append(group_words)
            group_idx += 1
        return " ".join(reversed(groups))

    return convert_number(n)
+# @lru_cache(maxsize=1)
+# def get_normalizers():
+#     """Lazy load normalizers"""
+#     from wetext import Normalizer  # type: ignore
+#     return (
+#         Normalizer(remove_erhua=False, lang="zh", operator="tn"),
+#         Normalizer(lang="en", operator="tn"),
+#     )
def normalize_chinese(text: str) -> str:
    """Normalize text on the Chinese path.

    Steps: strip trailing whitespace, expand English "'s" contractions,
    protect pinyin and hyphenated names with placeholders, restore them
    (pinyin passed through ``correct_pinyin``), then map punctuation with
    ``ZH_CHAR_MAP``.
    """
    prepared = expand_contractions(text.rstrip())
    prepared, pinyin_map = save_and_replace(prepared, PINYIN_PATTERN, "pinyin")
    prepared, name_map = save_and_replace(prepared, NAME_PATTERN, "n")
    # TODO: improve Chinese normalizers. A dedicated normalizer would run here,
    # between protecting and restoring the placeholders; for now the text
    # passes through unchanged.
    normalized = prepared
    # Names are restored before pinyin, mirroring the reverse of the order in
    # which they were protected.
    normalized = restore_placeholders(normalized, name_map)
    normalized = restore_placeholders(normalized, pinyin_map, correct_pinyin)
    return replace_chars(normalized, ZH_CHAR_MAP)
def normalize_english(text: str) -> str:
    """Normalize text on the English path.

    Steps: expand "'s" contractions, spell dollar amounts, spell runs of
    space-separated single digits one by one, spell remaining integers
    (thousands commas allowed), collapse whitespace, then map punctuation
    with ``CHAR_MAP``.

    If any spelling step raises, the text as processed so far is used without
    whitespace collapsing (best-effort behavior kept from the original).
    """
    text = expand_contractions(text)
    try:
        # Currently dollar only.
        def process_currency(match):
            raw = match.group(0)
            digits = extract_all_digits(raw)
            if not digits:
                return raw
            # Drop "$", whitespace and thousands commas, plus any trailing
            # period the regex swallowed from the end of a sentence.
            compact = re.sub(r"[$,\s]", "", raw).rstrip(".")
            whole, dot, fraction = compact.partition(".")
            if (
                dot
                and fraction.isdigit()
                and len(fraction) <= 2
                and (not whole or whole.isdigit())
            ):
                # One decimal point followed by 1-2 digits: read as cents
                # instead of gluing the digits into a single integer.
                dollars = int(whole) if whole else 0
                cents = int(fraction.ljust(2, "0"))
                spoken = []
                if dollars or not cents:
                    spoken.append(
                        f"{number_to_words(dollars)} dollar{'s' if dollars != 1 else ''}"
                    )
                if cents:
                    spoken.append(
                        f"{number_to_words(cents)} cent{'s' if cents != 1 else ''}"
                    )
                return " and ".join(spoken) + " "
            # Anything else (no fraction, or an ambiguous one) keeps the
            # original reading: all digits form one integer.
            num = int(digits)
            word_form = number_to_words(num)
            return f"{word_form} dollar{'s' if num != 1 else ''} "

        text = re.sub(r"\$\s*[0-9,.\s]+", process_currency, text).rstrip()

        def process_digits(match):
            parts = match.group(0).split()
            if all(len(part) == 1 and part.isdigit() for part in parts):
                return " ".join(number_to_words(int(digit)) for digit in parts)
            return number_to_words(int(extract_all_digits(match.group(0))))

        text = re.sub(r"\b\d(\s+\d)+\b", process_digits, text)

        def process_number(match):
            digits = extract_all_digits(match.group(0))
            if digits:
                return number_to_words(int(digits))
            return match.group(0)

        text = re.sub(r"\b\d+(?:,\d+)*\b", process_number, text)
        result = re.sub(r"\s+", " ", text).strip()
    except Exception:
        # Deliberate best effort: fall back to whatever was produced so far.
        result = text
    return replace_chars(result, CHAR_MAP)
def normalize(text: str) -> str:
    """Normalize ``text`` with the Chinese or English path, chosen by ``use_chinese``."""
    if use_chinese(text):
        return normalize_chinese(text)
    return normalize_english(text)
def tokenize_by_CJK_char(line: str, do_upper_case=True) -> str:
    """
    Split a line so that every CJK character becomes its own token.

    Text between CJK characters is kept as one chunk with surrounding
    whitespace stripped; whitespace inside such a chunk is left as is.
    Example:
      input = "你好世界是 hello world 的中文"
      output = "你 好 世 界 是 HELLO WORLD 的 中 文"
    Args:
      line:
        The input text.
      do_upper_case:
        When true (the default), every token is upper-cased.
    Return:
      The tokens joined by single spaces.
    """
    # The CJK ranges are from https://github.com/alvations/nltk/blob/79eed6ddea0d0a2c212c1060b477fc268fec4d4b/nltk/tokenize/util.py
    CJK_RANGE_PATTERN = r"([\u1100-\u11ff\u2e80-\ua4cf\ua840-\uD7AF\uF900-\uFAFF\uFE30-\uFE4F\uFF65-\uFFDC\U00020000-\U0002FFFF])"
    tokens = []
    # The capture group makes re.split keep each CJK character as a piece.
    for piece in re.split(CJK_RANGE_PATTERN, line.strip()):
        piece = piece.strip()
        if not piece:
            continue
        tokens.append(piece.upper() if do_upper_case else piece)
    return " ".join(tokens)

nexaai/binds/metal/py-lib/mlx_audio/tts/models/indextts/perceiver.py ADDED Viewed

@@ -0,0 +1,62 @@
+from typing import Optional
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_audio.tts.models.indextts.attention import MultiHeadAttention
+# gated gelu feedforward
class FeedForward(nn.Module):
    """Gated GELU feed-forward block.

    ``w_1`` projects to ``2 * d_ff`` features, which are split in half along
    the last axis. The second half goes through GELU and gates the first half
    before ``w_2`` projects back to ``dim``.
    """

    def __init__(self, dim: int, d_ff: int, use_bias: bool = True):
        super().__init__()
        # A single projection produces both the value half and the gate half.
        self.w_1 = nn.Linear(dim, 2 * d_ff, bias=use_bias)
        self.activation = nn.GELU()
        self.w_2 = nn.Linear(d_ff, dim, bias=use_bias)

    def __call__(self, x: mx.array) -> mx.array:
        projected = self.w_1(x)
        value, gate = mx.split(projected, 2, axis=-1)
        gated = self.activation(gate) * value
        return self.w_2(gated)
class PerceiverResampler(nn.Module):
    """Resample a variable-length sequence into ``n_latents`` latent vectors.

    Each of the ``n_depth`` layers lets the latents attend over the input
    concatenated with the latents themselves, then applies a gated
    feed-forward block; both steps are residual. The output is RMS-normalized.
    """

    def __init__(
        self,
        n_dim: int,
        n_depth=2,
        n_dim_context: Optional[int] = None,
        n_latents=32,
        n_dim_head=64,
        n_heads=8,
        n_ff_mult=4,
    ):
        super().__init__()
        if n_dim_context is None:
            n_dim_context = n_dim
        # Project the context only when its width differs from the latents'.
        if n_dim_context != n_dim:
            self.proj_context = nn.Linear(n_dim_context, n_dim)
        else:
            self.proj_context = nn.Identity()
        # Zero-initialized here; presumably overwritten by loaded weights.
        self.latents = mx.zeros((n_latents, n_dim))
        ff_dim = (n_dim * n_ff_mult * 2) // 3
        # NOTE(review): the positional False is an argument of the project's
        # MultiHeadAttention; its meaning is not visible from this file.
        self.layers = [
            [
                MultiHeadAttention(n_heads, n_dim, False, n_dim_head),
                FeedForward(n_dim, ff_dim),
            ]
            for _ in range(n_depth)
        ]
        self.norm = nn.RMSNorm(n_dim)

    def __call__(self, x, mask=None):
        batch = x.shape[0]
        # Give every batch element its own copy of the latents.
        latents = mx.broadcast_to(self.latents, (batch, *self.latents.shape))
        context = self.proj_context(x)
        for attention, feed_forward in self.layers:
            # Keys/values are the context followed by the current latents.
            keys_values = mx.concat([context, latents], axis=-2)
            latents += attention(latents, keys_values, keys_values, mask=mask)
            latents += feed_forward(latents)
        return self.norm(latents)

nexaai/binds/metal/py-lib/mlx_audio/tts/models/interpolate.py ADDED Viewed

@@ -0,0 +1,108 @@
+from typing import List, Optional, Tuple, Union
+import mlx.core as mx
def interpolate(
    input: mx.array,
    size: Optional[Union[int, Tuple[int, ...], List[int]]] = None,
    scale_factor: Optional[Union[float, List[float], Tuple[float, ...]]] = None,
    mode: str = "nearest",
    align_corners: Optional[bool] = None,
) -> mx.array:
    """Resize the spatial dimension of an array (1D only for now).

    Args:
        input (mx.array): Input tensor [N, C, ...] where ... represents spatial dimensions
        size (int or tuple): Output size; mutually exclusive with scale_factor
        scale_factor (float or tuple): Multiplier for spatial size
        mode (str): 'nearest' or 'linear'
        align_corners (bool): If True, align corners of input and output tensors

    Raises:
        ValueError: if the input has fewer than 3 dimensions, if both or
            neither of size / scale_factor are given, or if there is more
            than one spatial dimension.
    """
    ndim = input.ndim
    if ndim < 3:
        raise ValueError(f"Expected at least 3D input (N, C, D1), got {ndim}D")
    spatial_dims = ndim - 2

    # Exactly one of size / scale_factor must be supplied.
    if size is None and scale_factor is None:
        raise ValueError("One of size or scale_factor must be defined")
    if size is not None and scale_factor is not None:
        raise ValueError("Only one of size or scale_factor should be defined")

    if size is None:
        # Broadcast a scalar factor to every spatial dimension.
        if not isinstance(scale_factor, (list, tuple)):
            scale_factor = [scale_factor] * spatial_dims
        # Output length is the ceiling of length * factor, at least 1.
        # NOTE(review): the original comment says ceiling matches PyTorch,
        # but PyTorch documents floor for this computation; confirm.
        size = [
            max(1, int(mx.ceil(input.shape[axis + 2] * scale_factor[axis])))
            for axis in range(spatial_dims)
        ]
    elif not isinstance(size, (list, tuple)):
        # Broadcast a scalar size to every spatial dimension.
        size = [size] * spatial_dims

    # Only the (N, C, W) case is implemented.
    if spatial_dims != 1:
        raise ValueError(
            f"Only 1D interpolation currently supported, got {spatial_dims}D"
        )
    return interpolate1d(input, size[0], mode, align_corners)
def interpolate1d(
    input: mx.array,
    size: int,
    mode: str = "linear",
    align_corners: Optional[bool] = None,
) -> mx.array:
    """1D interpolation implementation.

    Args:
        input: Array of shape (batch, channels, width).
        size: Output width; values below 1 are treated as 1.
        mode: "nearest" selects nearest-neighbour sampling; any other value
            is treated as linear interpolation.
        align_corners: If true (and size > 1), the first and last output
            samples coincide with the first and last input samples.
            Otherwise half-pixel-centred coordinates are used.

    Returns:
        Array of shape (batch, channels, size).
    """
    batch_size, channels, in_width = input.shape
    # Handle edge cases
    if size < 1:
        size = 1
    if in_width < 1:
        in_width = 1

    if mode == "nearest":
        if size == 1:
            indices = mx.array([0])
        else:
            scale = in_width / size
            indices = mx.floor(mx.arange(size) * scale).astype(mx.int32)
            indices = mx.clip(indices, 0, in_width - 1)
        return input[:, :, indices]

    # Linear interpolation: compute fractional source coordinates.
    if align_corners and size > 1:
        x = mx.arange(size) * ((in_width - 1) / (size - 1))
    elif size == 1:
        x = mx.array([0.0])
    else:
        step = in_width / size
        x = mx.arange(size) * step + 0.5 * step - 0.5
        # Half-pixel coordinates go negative at the left edge when
        # upsampling. Without this clamp, floor() gives index -1, which
        # wraps around and blends the LAST input sample into the first
        # output sample.
        x = mx.maximum(x, 0)

    # Handle the case where input width is 1
    if in_width == 1:
        return mx.broadcast_to(input, (batch_size, channels, size))

    x_low = mx.floor(x).astype(mx.int32)
    # Clamp the right neighbour so the last sample interpolates with itself.
    x_high = mx.minimum(x_low + 1, in_width - 1)
    x_frac = x - x_low

    y_low = input[:, :, x_low]
    y_high = input[:, :, x_high]
    # Weighted blend of the two neighbours.
    output = y_low * (1 - x_frac)[None, None, :] + y_high * x_frac[None, None, :]
    return output

nexaai/binds/metal/py-lib/mlx_audio/tts/models/kokoro/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .kokoro import Model, ModelConfig
+from .pipeline import KokoroPipeline
+__all__ = ["KokoroPipeline", "Model", "ModelConfig"]