PyPI - nexaai - Versions diffs - 1.0.29__cp310-cp310-macosx_14_0_universal2.whl - Mend

nexaai 1.0.29__cp310-cp310-macosx_14_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (580) hide show

nexaai/mlx_backend/vlm/modeling/models/deepseek_vl_v2/language.py ADDED Viewed

@@ -0,0 +1,591 @@
+import inspect
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, Optional, Tuple, Union
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.switch_layers import SwitchGLU
+from ..base import (
+    LanguageModelOutput,
+    create_attention_mask,
+    scaled_dot_product_attention,
+)
+from ..cache import KVCache, RotatingKVCache
@dataclass
class TextConfig:
    """Hyper-parameters for the DeepSeek-V2 style text decoder in this module.

    Field order is part of the interface (the dataclass constructor accepts
    positional arguments), so new fields should only be appended.
    """

    model_type: str = "deepseek_v2"
    vocab_size: int = 102400
    hidden_size: int = 1280
    intermediate_size: int = 6848
    moe_intermediate_size: int = 896
    num_hidden_layers: int = 30
    num_attention_heads: int = 32
    # ``None`` is replaced by ``num_attention_heads`` in ``__post_init__``.
    num_key_value_heads: Optional[int] = 32
    # ``None`` disables the shared-expert MLP in the MoE block.
    n_shared_experts: Optional[int] = 2
    # ``None`` makes every decoder layer use a dense MLP.
    n_routed_experts: Optional[int] = 64
    routed_scaling_factor: float = 1.0
    kv_lora_rank: int = 512
    # ``None`` selects a single full-rank query projection.
    q_lora_rank: Optional[int] = 1536
    qk_rope_head_dim: int = 64
    v_head_dim: int = 128
    # ``0`` switches ``attn_type`` to "LlamaAttention" in ``__post_init__``.
    qk_nope_head_dim: int = 128
    topk_method: str = "greedy"
    n_group: Optional[int] = 1
    topk_group: Optional[int] = 1
    num_experts_per_tok: Optional[int] = 6
    moe_layer_freq: int = 1
    first_k_dense_replace: int = 0
    max_position_embeddings: int = 2048
    rms_norm_eps: float = 1e-6
    rope_theta: float = 10000.0
    rope_traditional: bool = True
    # ``None`` means plain RoPE; otherwise a dict of rope-scaling options.
    rope_scaling: Optional[Dict[str, Any]] = None
    attention_bias: bool = False
    scoring_func: str = "softmax"
    attn_type: str = "DeepseekV2Attention"

    @classmethod
    def from_dict(cls, params):
        """Build a config from ``params``, silently dropping unknown keys.

        Args:
            params: Mapping of option names to values; keys that are not
                constructor parameters of this class are ignored.

        Returns:
            A new ``TextConfig`` instance.
        """
        # Resolve the accepted parameter names once rather than per key.
        accepted = inspect.signature(cls).parameters
        return cls(**{k: v for k, v in params.items() if k in accepted})

    def __post_init__(self):
        """Derive ``attn_type`` and ``num_key_value_heads`` from other fields."""
        if self.qk_nope_head_dim == 0:
            self.attn_type = "LlamaAttention"
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads
def yarn_find_correction_dim(
    num_rotations, dim, base=10000, max_position_embeddings=2048
):
    """Return the (fractional) rotary dimension index for ``num_rotations``.

    Evaluates ``dim * ln(max_position_embeddings / (2 * pi * num_rotations))
    / (2 * ln(base))``.

    Args:
        num_rotations: Rotation count used in the ratio above.
        dim: Rotary embedding dimension.
        base: RoPE base.
        max_position_embeddings: Context length used in the ratio above.

    Returns:
        A float; callers round and clamp it themselves.
    """
    ratio = max_position_embeddings / (num_rotations * 2 * math.pi)
    numerator = dim * math.log(ratio)
    denominator = 2 * math.log(base)
    return numerator / denominator
def yarn_find_correction_range(
    low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
):
    """Return the integer ``(low, high)`` dimension range for two rotation counts.

    The correction dimension of ``low_rot`` is floored and clamped to be at
    least 0; the one of ``high_rot`` is ceiled and clamped to at most
    ``dim - 1``.
    """
    lower, upper = (
        yarn_find_correction_dim(rot, dim, base, max_position_embeddings)
        for rot in (low_rot, high_rot)
    )
    first = max(math.floor(lower), 0)
    last = min(math.ceil(upper), dim - 1)
    return first, last
def yarn_get_mscale(scale=1, mscale=1):
    """Return ``0.1 * mscale * ln(scale) + 1`` for ``scale > 1``, else ``1.0``."""
    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
def yarn_linear_ramp_mask(min_val, max_val, dim):
    """Return a length-``dim`` float32 ramp rising from 0 to 1.

    Index ``i`` maps to ``(i - min_val) / (max_val - min_val)`` clipped to
    ``[0, 1]``.
    """
    # Nudge the upper bound when both ends coincide to avoid dividing by zero.
    upper = max_val + 0.001 if min_val == max_val else max_val
    positions = mx.arange(dim, dtype=mx.float32)
    ramp = (positions - min_val) / (upper - min_val)
    return mx.clip(ramp, 0, 1)
class DeepseekV2YarnRotaryEmbedding(nn.Module):
    """Rotary embedding with per-dimension frequencies blended YaRN-style.

    The frequencies are computed once in the constructor and handed to
    ``mx.fast.rope`` on every call.
    """

    def __init__(
        self,
        dim,
        max_position_embeddings=2048,
        base=10000,
        scaling_factor=1.0,
        original_max_position_embeddings=4096,
        beta_fast=32,
        beta_slow=1,
        mscale=1,
        mscale_all_dim=0,
    ):
        """Precompute the input scale and the blended frequency table.

        ``max_position_embeddings`` is accepted but not used by this class.
        """
        super().__init__()
        numerator = yarn_get_mscale(scaling_factor, mscale)
        denominator = yarn_get_mscale(scaling_factor, mscale_all_dim)
        self.mscale = numerator / denominator

        # Unscaled frequencies and their ``scaling_factor``-stretched variant.
        exponents = mx.arange(0, dim, 2, dtype=mx.float32) / dim
        freq_extra = base**exponents
        freq_inter = scaling_factor * freq_extra

        low, high = yarn_find_correction_range(
            beta_fast,
            beta_slow,
            dim,
            base,
            original_max_position_embeddings,
        )
        # 1 below ``low``, 0 above ``high``, linear in between.
        freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2)
        blended = freq_inter * freq_mask + freq_extra * (1 - freq_mask)
        self._freqs = (freq_inter * freq_extra) / blended

    def __call__(self, x, offset=0):
        """Apply the rotary embedding to ``x`` starting at position ``offset``."""
        scaled = x if self.mscale == 1.0 else self.mscale * x
        return mx.fast.rope(
            scaled,
            scaled.shape[-1],
            traditional=True,
            base=None,
            scale=1.0,
            offset=offset,
            freqs=self._freqs,
        )
class DeepseekV2Attention(nn.Module):
    """Attention with low-rank compressed keys/values and split rope/no-rope heads.

    Each query/key head is the concatenation of a ``qk_nope_head_dim`` part
    that is used as-is and a ``qk_rope_head_dim`` part that goes through the
    rotary embedding. The rotary key part is computed once per token and
    repeated across heads.
    """

    def __init__(self, config: TextConfig):
        """Create the projections and the rotary embedding from ``config``."""
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.q_lora_rank = config.q_lora_rank
        self.qk_rope_head_dim = config.qk_rope_head_dim
        self.kv_lora_rank = config.kv_lora_rank
        self.v_head_dim = config.v_head_dim
        self.qk_nope_head_dim = config.qk_nope_head_dim
        self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
        self.scale = self.q_head_dim**-0.5

        if self.q_lora_rank is None:
            # Single full-rank query projection.
            self.q_proj = nn.Linear(
                self.hidden_size, self.num_heads * self.q_head_dim, bias=False
            )
        else:
            # Low-rank query projection: down-project, normalise, up-project.
            self.q_a_proj = nn.Linear(
                self.hidden_size, self.q_lora_rank, bias=config.attention_bias
            )
            self.q_a_layernorm = nn.RMSNorm(self.q_lora_rank)
            self.q_b_proj = nn.Linear(
                self.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
            )

        # Produces the compressed KV latent plus the shared rotary key part.
        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            self.kv_lora_rank + self.qk_rope_head_dim,
            bias=config.attention_bias,
        )
        self.kv_a_layernorm = nn.RMSNorm(self.kv_lora_rank)
        # Expands the latent into per-head no-rope keys and values.
        self.kv_b_proj = nn.Linear(
            self.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )
        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=config.attention_bias,
        )

        rope_scaling = config.rope_scaling
        if rope_scaling is None:
            self.rope = nn.RoPE(
                self.qk_rope_head_dim,
                traditional=config.rope_traditional,
                base=self.rope_theta,
            )
        else:
            mscale_all_dim = rope_scaling.get("mscale_all_dim", 0)
            scaling_factor = rope_scaling.get("factor", 1)
            if mscale_all_dim:
                mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
                self.scale = self.scale * mscale * mscale
            # The rotary embedding must exist whenever rope scaling is
            # configured, not only when ``mscale_all_dim`` is non-zero;
            # otherwise ``self.rope`` is undefined at call time.
            rope_kwargs = {
                key: rope_scaling[key]
                for key in (
                    "original_max_position_embeddings",
                    "beta_fast",
                    "beta_slow",
                    "mscale",
                    "mscale_all_dim",
                )
                if key in rope_scaling
            }
            self.rope = DeepseekV2YarnRotaryEmbedding(
                dim=self.qk_rope_head_dim,
                max_position_embeddings=self.max_position_embeddings,
                scaling_factor=scaling_factor,
                base=self.rope_theta,
                **rope_kwargs,
            )

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Run attention over ``x``.

        Args:
            x: Input of shape ``(B, L, D)``.
            mask: Optional attention mask forwarded to
                ``scaled_dot_product_attention``.
            cache: Optional KV cache exposing ``offset`` and
                ``update_and_fetch``.

        Returns:
            The output of ``o_proj`` applied to the ``(B, L, -1)`` attention
            result.
        """
        B, L, _ = x.shape

        if self.q_lora_rank is None:
            q = self.q_proj(x)
        else:
            q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(x)))
        q = q.reshape(B, L, self.num_heads, self.q_head_dim).transpose(0, 2, 1, 3)
        q_nope, q_pe = mx.split(q, [self.qk_nope_head_dim], axis=-1)

        compressed_kv = self.kv_a_proj_with_mqa(x)
        compressed_kv, k_pe = mx.split(compressed_kv, [self.kv_lora_rank], axis=-1)
        k_pe = k_pe.reshape(B, L, 1, self.qk_rope_head_dim).transpose(0, 2, 1, 3)
        kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
        kv = kv.reshape(B, L, self.num_heads, -1).transpose(0, 2, 1, 3)
        k_nope, values = mx.split(kv, [self.qk_nope_head_dim], axis=-1)

        # Positions continue from the cache when one is supplied.
        offset = 0 if cache is None else cache.offset
        q_pe = self.rope(q_pe, offset)
        k_pe = self.rope(k_pe, offset)
        # The rotary key part is shared by all heads.
        k_pe = mx.repeat(k_pe, self.num_heads, axis=1)
        keys = mx.concatenate([k_nope, k_pe], axis=-1)
        if cache is not None:
            keys, values = cache.update_and_fetch(keys, values)

        queries = mx.concatenate([q_nope, q_pe], axis=-1)
        output = scaled_dot_product_attention(
            queries, keys, values, cache, scale=self.scale, mask=mask
        )
        output = output.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(output)
class LlamaAttention(nn.Module):
    """Multi-head attention with separate q/k/v projections and plain RoPE."""

    def __init__(self, config: TextConfig):
        """Create the projections and the rotary embedding from ``config``."""
        super().__init__()
        dim = config.hidden_size
        n_heads = config.num_attention_heads
        n_kv_heads = config.num_key_value_heads
        head_dim = config.hidden_size // n_heads

        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.head_dim = head_dim
        self.scale = head_dim**-0.5

        # Any falsy setting (including ``None``) means "no bias".
        attention_bias = config.attention_bias or False

        self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=attention_bias)
        self.k_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
        self.v_proj = nn.Linear(dim, n_kv_heads * head_dim, bias=attention_bias)
        self.o_proj = nn.Linear(n_heads * head_dim, dim, bias=attention_bias)

        # Only "linear" rope scaling changes the position scale.
        rope_scale = 1
        if (
            config.rope_scaling is not None
            and config.rope_scaling["type"] == "linear"
        ):
            rope_scale = 1 / config.rope_scaling["factor"]

        self.rope = nn.RoPE(
            head_dim,
            traditional=config.rope_traditional,
            base=config.rope_theta,
            scale=rope_scale,
        )

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Run attention over ``x`` of shape ``(B, L, D)``.

        ``cache``, when given, supplies the rotary offset and is updated with
        the new keys and values.
        """
        B, L, D = x.shape

        def split_heads(projected, heads):
            # (B, L, heads * head_dim) -> (B, heads, L, head_dim)
            return projected.reshape(B, L, heads, -1).transpose(0, 2, 1, 3)

        queries = split_heads(self.q_proj(x), self.n_heads)
        keys = split_heads(self.k_proj(x), self.n_kv_heads)
        values = split_heads(self.v_proj(x), self.n_kv_heads)

        rope_kwargs = {} if cache is None else {"offset": cache.offset}
        queries = self.rope(queries, **rope_kwargs)
        keys = self.rope(keys, **rope_kwargs)
        if cache is not None:
            keys, values = cache.update_and_fetch(keys, values)

        attended = scaled_dot_product_attention(
            queries, keys, values, cache, scale=self.scale, mask=mask
        )
        merged = attended.transpose(0, 2, 1, 3).reshape(B, L, -1)
        return self.o_proj(merged)
class DeepseekV2MLP(nn.Module):
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``."""

    def __init__(
        self, config: TextConfig, hidden_size: int = None, intermediate_size: int = None
    ):
        """Create the three bias-free projections.

        ``hidden_size`` and ``intermediate_size`` fall back to the values in
        ``config`` when left as ``None``.
        """
        super().__init__()
        self.config = config
        if hidden_size is None:
            hidden_size = config.hidden_size
        if intermediate_size is None:
            intermediate_size = config.intermediate_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def __call__(self, x):
        """Apply the gated feed-forward transform to ``x``."""
        gated = nn.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
class MoEGate(nn.Module):
    """Router that selects ``top_k`` experts per token and weights them."""

    def __init__(self, config: TextConfig):
        """Copy routing options from ``config`` and allocate the gate weights."""
        super().__init__()
        self.config = config
        self.scoring_func = config.scoring_func
        self.top_k = config.num_experts_per_tok
        self.n_routed_experts = config.n_routed_experts
        self.routed_scaling_factor = config.routed_scaling_factor
        self.topk_method = config.topk_method
        self.n_group = config.n_group
        self.topk_group = config.topk_group
        if self.topk_method == "noaux_tc":
            # Per-expert bias added to the scores for selection only.
            self.e_score_correction_bias = mx.zeros((self.n_routed_experts,))
        self.weight = mx.zeros((self.n_routed_experts, config.hidden_size))

    def _zero_lowest_groups(self, grouped_scores, group_scores):
        """Zero all expert scores in the ``n_group - topk_group`` weakest groups.

        Args:
            grouped_scores: Scores of shape ``(bsz, seq_len, n_group, -1)``.
            group_scores: One ranking value per group, shape
                ``(bsz, seq_len, n_group)``.

        Returns:
            ``grouped_scores`` with the discarded groups set to 0.
        """
        bsz, seq_len = grouped_scores.shape[:2]
        k = self.n_group - self.topk_group
        # The first k entries of the partition are the k lowest-ranked groups.
        group_idx = mx.argpartition(group_scores, kth=k - 1, axis=-1)[..., :k]
        batch_idx = mx.expand_dims(mx.arange(bsz), (1, 2))
        seq_idx = mx.expand_dims(mx.arange(seq_len), (0, 2))
        grouped_scores[batch_idx, seq_idx, group_idx] = 0.0
        return grouped_scores

    def __call__(self, x):
        """Route each token of ``x``.

        Args:
            x: Input whose first two axes are ``(bsz, seq_len)`` and whose last
                axis matches ``hidden_size``.

        Returns:
            ``(inds, scores)``: the indices of the ``top_k`` chosen experts and
            their weights multiplied by ``routed_scaling_factor``, both with a
            trailing axis of size ``top_k``.

        Raises:
            ValueError: If ``scoring_func`` or ``topk_method`` is not supported.
        """
        gates = x @ self.weight.T

        if self.scoring_func == "softmax":
            scores = mx.softmax(gates, axis=-1, precise=True)
        elif self.scoring_func == "sigmoid":
            scores = mx.sigmoid(gates)
        else:
            raise ValueError(f"Unknown scoring function: {self.scoring_func}")

        bsz, seq_len = x.shape[:2]
        if self.topk_method == "greedy":
            # Rank groups by their best expert, drop the weakest groups, and
            # take both the selection and the weights from the masked scores.
            grouped = scores.reshape(bsz, seq_len, self.n_group, -1)
            group_scores = grouped.max(axis=-1)
            grouped = self._zero_lowest_groups(grouped, group_scores)
            candidates = grouped.reshape(bsz, seq_len, -1)
            weight_source = candidates
        elif self.topk_method == "noaux_tc":
            # Selection uses the bias-corrected scores; the returned weights
            # are gathered from the uncorrected scores.
            corrected = scores + self.e_score_correction_bias
            grouped = corrected.reshape(bsz, seq_len, self.n_group, -1)
            # Rank groups by the sum of their two best corrected scores.
            group_scores = mx.topk(grouped, 2, axis=-1).sum(axis=-1)
            grouped = self._zero_lowest_groups(grouped, group_scores)
            candidates = grouped.reshape(bsz, seq_len, -1)
            weight_source = scores
        else:
            raise ValueError(f"Unknown topk method: {self.topk_method}")

        k = self.top_k
        inds = mx.argpartition(-candidates, kth=k - 1, axis=-1)[..., :k]
        scores = mx.take_along_axis(weight_source, inds, axis=-1)
        scores = scores * self.routed_scaling_factor
        return inds, scores
class DeepseekV2MoE(nn.Module):
    """Mixture-of-experts feed-forward block with optional shared experts."""

    def __init__(self, config: TextConfig):
        """Create the routed experts, the gate and (optionally) the shared MLP."""
        super().__init__()
        self.config = config
        self.num_experts_per_tok = config.num_experts_per_tok
        self.switch_mlp = SwitchGLU(
            config.hidden_size, config.moe_intermediate_size, config.n_routed_experts
        )
        self.gate = MoEGate(config)
        shared_count = config.n_shared_experts
        if shared_count is not None:
            # The shared MLP is as wide as ``shared_count`` routed experts.
            self.shared_experts = DeepseekV2MLP(
                config=config,
                intermediate_size=config.moe_intermediate_size * shared_count,
            )

    def __call__(self, x):
        """Return the gate-weighted expert outputs plus the shared MLP output."""
        expert_ids, weights = self.gate(x)
        expert_out = self.switch_mlp(x, expert_ids)
        # Weighted sum over the selected-experts axis.
        mixed = (expert_out * weights[..., None]).sum(axis=-2)
        if self.config.n_shared_experts is None:
            return mixed
        return mixed + self.shared_experts(x)
class DeepseekV2DecoderLayer(nn.Module):
    """Pre-norm decoder layer: attention and feed-forward, each with a residual."""

    def __init__(self, config: TextConfig, layer_idx: int):
        """Pick the attention and feed-forward variants for layer ``layer_idx``."""
        super().__init__()
        self.attn_type = config.attn_type
        if self.attn_type == "DeepseekV2Attention":
            self.self_attn = DeepseekV2Attention(config)
        else:
            self.self_attn = LlamaAttention(config)

        # MoE applies only when experts are configured, the layer is past the
        # leading dense layers, and it falls on the MoE layer stride.
        use_moe = (
            config.n_routed_experts is not None
            and layer_idx >= config.first_k_dense_replace
            and layer_idx % config.moe_layer_freq == 0
        )
        self.mlp = DeepseekV2MoE(config) if use_moe else DeepseekV2MLP(config)

        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = nn.RMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Apply attention then the feed-forward block, adding each to its input."""
        attn_out = self.self_attn(self.input_layernorm(x), mask, cache)
        hidden = x + attn_out
        ffn_out = self.mlp(self.post_attention_layernorm(hidden))
        return hidden + ffn_out
class DeepseekV2Model(nn.Module):
    """Token embedding, a stack of decoder layers, and a final RMSNorm."""

    def __init__(self, config: TextConfig):
        """Build the embedding table, ``num_hidden_layers`` layers and the norm."""
        super().__init__()
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = [
            DeepseekV2DecoderLayer(config, layer_idx)
            for layer_idx in range(config.num_hidden_layers)
        ]
        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def __call__(
        self,
        x: mx.array,
        mask: Optional[mx.array] = None,
        inputs_embeds: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ) -> mx.array:
        """Run the decoder stack and return the normalised hidden states.

        Args:
            x: Token ids; only used when ``inputs_embeds`` is ``None``.
            mask: Optional attention mask; built with
                ``create_attention_mask`` when omitted.
            inputs_embeds: Optional precomputed embeddings used instead of
                embedding ``x``.
            cache: Optional per-layer caches, iterated in step with the layers.
        """
        hidden = self.embed_tokens(x) if inputs_embeds is None else inputs_embeds
        layer_caches = [None] * len(self.layers) if cache is None else cache
        if mask is None:
            mask = create_attention_mask(hidden, layer_caches)
        for layer, layer_cache in zip(self.layers, layer_caches):
            hidden = layer(hidden, mask, layer_cache)
        return self.norm(hidden)
class LanguageModel(nn.Module):
    """Decoder stack plus a bias-free linear head producing vocabulary logits."""

    def __init__(self, config: TextConfig):
        """Build the decoder and the output projection from ``config``."""
        super().__init__()
        self.config = config
        self.model_type = config.model_type
        self.model = DeepseekV2Model(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def __call__(
        self,
        inputs: mx.array,
        inputs_embeds: Optional[mx.array] = None,
        mask: Optional[mx.array] = None,
        cache: Optional[Any] = None,
    ):
        """Return a ``LanguageModelOutput`` whose ``logits`` come from ``lm_head``."""
        out = self.model(inputs, mask=mask, inputs_embeds=inputs_embeds, cache=cache)
        out = self.lm_head(out)
        return LanguageModelOutput(logits=out)

    def sanitize(self, weights):
        """Stack per-expert weights into the layout used by ``switch_mlp``.

        For every layer and projection, the entries
        ``...mlp.experts.{e}.{proj}.{kind}`` are removed from ``weights`` and
        replaced by one stacked array under ``...mlp.switch_mlp.{proj}.{kind}``.
        Layers that have no ``experts.0`` entry are left untouched.

        Args:
            weights: Mapping from parameter name to array; modified in place.

        Returns:
            The same ``weights`` mapping.
        """
        for layer_idx in range(self.config.num_hidden_layers):
            prefix = f"language_model.model.layers.{layer_idx}"
            for proj in ("gate_proj", "down_proj", "up_proj"):
                for kind in ("weight", "scales", "biases"):
                    if f"{prefix}.mlp.experts.0.{proj}.{kind}" not in weights:
                        continue
                    to_join = [
                        weights.pop(f"{prefix}.mlp.experts.{e}.{proj}.{kind}")
                        for e in range(self.config.n_routed_experts)
                    ]
                    weights[f"{prefix}.mlp.switch_mlp.{proj}.{kind}"] = mx.stack(
                        to_join
                    )
        return weights

    @property
    def layers(self):
        """The decoder layers of the wrapped model."""
        return self.model.layers

    @property
    def head_dim(self):
        """Per-head dimension(s) of the attention keys and values.

        Returns a ``(key_dim, value_dim)`` tuple for ``DeepseekV2Attention``
        and a single int for the Llama-style attention.
        """
        if self.config.attn_type == "DeepseekV2Attention":
            return (
                self.config.qk_nope_head_dim + self.config.qk_rope_head_dim,
                self.config.v_head_dim,
            )
        # Must match LlamaAttention, which derives its head size from the
        # number of attention heads rather than the number of KV heads.
        return self.config.hidden_size // self.config.num_attention_heads

    @property
    def n_kv_heads(self):
        """Number of key/value heads from the config."""
        return self.config.num_key_value_heads