xinference 0.16.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of xinference might be problematic.
- xinference/_version.py +3 -3
- xinference/api/restful_api.py +62 -11
- xinference/client/restful/restful_client.py +8 -2
- xinference/conftest.py +0 -8
- xinference/constants.py +2 -0
- xinference/core/model.py +44 -5
- xinference/core/supervisor.py +13 -7
- xinference/core/utils.py +76 -12
- xinference/core/worker.py +5 -4
- xinference/deploy/cmdline.py +5 -0
- xinference/deploy/utils.py +7 -4
- xinference/model/audio/model_spec.json +2 -2
- xinference/model/image/stable_diffusion/core.py +5 -2
- xinference/model/llm/core.py +1 -3
- xinference/model/llm/llm_family.json +263 -4
- xinference/model/llm/llm_family_modelscope.json +302 -0
- xinference/model/llm/mlx/core.py +45 -2
- xinference/model/llm/vllm/core.py +2 -1
- xinference/model/rerank/core.py +11 -4
- xinference/thirdparty/fish_speech/fish_speech/conversation.py +254 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json +2 -2
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json +123 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/zh_CN.json +2 -1
- xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py +76 -11
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py +9 -9
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/fsq.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/text/clean.py +32 -1
- xinference/thirdparty/fish_speech/fish_speech/utils/__init__.py +2 -1
- xinference/thirdparty/fish_speech/fish_speech/utils/utils.py +22 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/launch_utils.py +1 -1
- xinference/thirdparty/fish_speech/fish_speech/webui/manage.py +1 -1
- xinference/thirdparty/fish_speech/tools/api.py +578 -75
- xinference/thirdparty/fish_speech/tools/e2e_webui.py +232 -0
- xinference/thirdparty/fish_speech/tools/fish_e2e.py +298 -0
- xinference/thirdparty/fish_speech/tools/llama/generate.py +393 -9
- xinference/thirdparty/fish_speech/tools/msgpack_api.py +90 -29
- xinference/thirdparty/fish_speech/tools/post_api.py +37 -15
- xinference/thirdparty/fish_speech/tools/schema.py +187 -0
- xinference/thirdparty/fish_speech/tools/vqgan/extract_vq.py +7 -1
- xinference/thirdparty/fish_speech/tools/vqgan/inference.py +2 -3
- xinference/thirdparty/fish_speech/tools/webui.py +138 -75
- {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/METADATA +26 -3
- {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/RECORD +49 -56
- {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/WHEEL +1 -1
- xinference/thirdparty/fish_speech/fish_speech/configs/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/configs/lora/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/datasets/protos/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/i18n/locale/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/__init__.py +0 -0
- xinference/thirdparty/fish_speech/fish_speech/webui/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/commons.py +0 -35
- xinference/thirdparty/fish_speech/tools/llama/__init__.py +0 -0
- xinference/thirdparty/fish_speech/tools/vqgan/__init__.py +0 -0
- {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/LICENSE +0 -0
- {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/entry_points.txt +0 -0
- {xinference-0.16.2.dist-info → xinference-1.0.0.dist-info}/top_level.txt +0 -0
xinference/thirdparty/fish_speech/fish_speech/conversation.py
@@ -1,2 +1,256 @@
+from dataclasses import dataclass, field
+from typing import Literal
+
+import torch
+from transformers import AutoTokenizer, PretrainedConfig, PreTrainedTokenizerFast
+
+IM_START_TOKEN = "<|im_start|>"
+IM_END_TOKEN = "<|im_end|>"
 SEMANTIC_TOKEN = "<|semantic|>"
+MEL_TOKEN = "<|mel|>"
+PHONEME_START_TOKEN = "<|phoneme_start|>"
+PHONEME_END_TOKEN = "<|phoneme_end|>"
+ALL_SPECIAL_TOKENS = [
+    IM_START_TOKEN,
+    IM_END_TOKEN,
+    SEMANTIC_TOKEN,
+    MEL_TOKEN,
+    PHONEME_START_TOKEN,
+    PHONEME_END_TOKEN,
+]
+
 CODEBOOK_PAD_TOKEN_ID = 0
+
+
+class FishTokenizerConfig(PretrainedConfig):
+    share_codebook_embeddings: bool = True
+    codebook_size: int = 1024
+    num_codebooks: int = 8
+
+
+class FishTokenizerFast(PreTrainedTokenizerFast):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.share_codebook_embeddings = kwargs.pop("share_codebook_embeddings", True)
+        self.codebook_size = kwargs.pop("codebook_size", 1024)
+        self.num_codebooks = kwargs.pop("num_codebooks", 8)
+
+
+AutoTokenizer.register(FishTokenizerConfig, fast_tokenizer_class=FishTokenizerFast)
+
+
+@dataclass(kw_only=True)
+class BasePart:
+    pass
+
+
+@dataclass(kw_only=True)
+class VQPart(BasePart):
+    codes: torch.Tensor
+
+
+@dataclass(kw_only=True)
+class TextPart(BasePart):
+    text: str
+
+
+@dataclass(kw_only=True)
+class MelPart(BasePart):
+    mels: torch.Tensor
+
+
+@dataclass(kw_only=True)
+class EncodedMessage:
+    tokens: torch.Tensor
+    labels: torch.Tensor
+    vq_parts: list[torch.Tensor]
+    mel_parts: list[torch.Tensor]
+    vq_require_losses: torch.Tensor | None = None
+
+
+@dataclass(kw_only=True)
+class Message:
+    role: Literal["system", "user", "assistant"]
+    parts: list[VQPart | TextPart | MelPart] = field(default_factory=list)
+    add_im_start: bool = True
+    add_im_end: bool = True
+    cal_loss: bool = False
+
+    # By default, ignore the loss of the auto-generated im_start token
+    ignore_im_start_loss: bool = True
+
+    def encode(
+        self: "Message",
+        tokenizer: AutoTokenizer,
+    ) -> EncodedMessage:
+        all_tokens = []
+        all_labels = []
+
+        # Multi-modal tokens
+        vq_parts = []
+        mel_parts = []
+
+        semantic_id, mel_id = tokenizer.convert_tokens_to_ids(
+            [SEMANTIC_TOKEN, MEL_TOKEN]
+        )
+
+        parts = self.parts.copy()
+        if self.add_im_start:
+            parts.insert(0, TextPart(text=f"<|im_start|>{self.role}\n"))
+
+        if self.add_im_end:
+            parts.append(TextPart(text="<|im_end|>"))
+
+        for part in parts:
+            if isinstance(part, TextPart):
+                tokens = tokenizer.encode(
+                    part.text,
+                    add_special_tokens=False,
+                    truncation=False,
+                    return_tensors="pt",
+                ).int()[0]
+            elif isinstance(part, VQPart):
+                tokens = torch.zeros(part.codes.shape[1], dtype=torch.int) + semantic_id
+                codes = part.codes.clone() + 1
+
+                if getattr(tokenizer, "share_codebook_embeddings", True) is False:
+                    for i in range(len(codes)):
+                        codes[i] += tokenizer.codebook_size * i
+
+                vq_parts.append(codes)
+            elif isinstance(part, MelPart):
+                tokens = torch.zeros(part.mels.shape[1], dtype=torch.int) + mel_id
+                mel_parts.append(part.mels)
+            else:
+                raise ValueError(f"Unsupported part type: {type(part)}")
+
+            all_tokens.append(tokens)
+            if self.cal_loss:
+                all_labels.append(tokens.clone())
+            else:
+                all_labels.append(torch.full_like(tokens, -100))
+
+        tokens = torch.cat(all_tokens, dim=0)
+        labels = torch.cat(all_labels, dim=0)
+        assert tokens.shape == labels.shape
+
+        if self.ignore_im_start_loss and self.add_im_start:
+            labels[: len(all_tokens[0])] = -100
+
+        return EncodedMessage(
+            tokens=tokens,
+            labels=labels,
+            vq_parts=vq_parts,
+            mel_parts=mel_parts,
+        )
+
+
+@dataclass
+class Conversation:
+    messages: list[Message]
+
+    def encode(
+        self: "Conversation",
+        tokenizer: AutoTokenizer,
+        add_shift: bool = True,
+    ) -> EncodedMessage:
+        # Build the input_ids and labels
+        tokens = []
+        labels = []
+        vq_parts = []
+        mel_parts = []
+        vq_require_losses = []
+
+        for message in self.messages:
+            encoded = message.encode(
+                tokenizer,
+            )
+            tokens.append(encoded.tokens)
+            labels.append(encoded.labels)
+            vq_parts.extend(encoded.vq_parts)
+            mel_parts.extend(encoded.mel_parts)
+            vq_require_losses.extend([message.cal_loss] * len(encoded.vq_parts))
+
+        tokens = torch.cat(tokens, dim=0)
+        labels = torch.cat(labels, dim=0)
+        vq_require_losses = torch.tensor(vq_require_losses, dtype=torch.bool)
+
+        if add_shift:
+            tokens = tokens[:-1]
+            labels = labels[1:]
+
+        assert tokens.dtype in [
+            torch.int,
+            torch.long,
+        ], f"Invalid dtype: {tokens.dtype}, conv: {conversation}"
+
+        return EncodedMessage(
+            tokens=tokens,
+            labels=labels,
+            vq_parts=vq_parts,
+            mel_parts=mel_parts,
+            vq_require_losses=vq_require_losses,
+        )
+
+    def encode_for_inference(
+        self: "Conversation",
+        tokenizer: AutoTokenizer,
+        num_codebooks: int,
+    ) -> EncodedMessage:
+        encoded = self.encode(tokenizer, add_shift=False)
+        tokens = encoded.tokens
+        values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.int)
+        values[0] = tokens
+
+        if encoded.vq_parts is None or len(encoded.vq_parts) == 0:
+            return values
+
+        semantic_id, mel_id = tokenizer.convert_tokens_to_ids(
+            [SEMANTIC_TOKEN, MEL_TOKEN]
+        )
+        vq_parts = encoded.vq_parts
+        vq_parts = torch.cat(vq_parts, dim=1)
+        values[1:, tokens == semantic_id] = vq_parts
+        return values
+
+    def visualize(self: "Conversation", tokenizer: AutoTokenizer):
+        encoded = self.encode(tokenizer, add_shift=False)
+
+        print_in_blue = lambda x: print("\033[94m" + x + "\033[0m", end="")
+        print_in_green = lambda x: print("\033[92m" + x + "\033[0m", end="")
+
+        for tok, lab in zip(encoded.tokens, encoded.labels):
+            val = tokenizer.decode(tok, skip_special_tokens=False)
+            if val == "\n":
+                val = "\\n\n"
+
+            if lab == -100:
+                print_in_green(val)
+            else:
+                print_in_blue(val)
+
+        print()
+
+
+if __name__ == "__main__":
+    message0 = Message(
+        role="user",
+        parts=[
+            TextPart(text="Hello, how are you?"),
+            VQPart(codes=torch.zeros((4, 10))),
+        ],
+        cal_loss=False,
+    )
+
+    message1 = Message(
+        role="assistant",
+        parts=[TextPart(text="I'm fine, thank you.")],
+        cal_loss=True,
+    )
+    conversation = Conversation([message0, message1])
+    tokenizer = AutoTokenizer.from_pretrained("checkpoints/Qwen2-1.5B-Instruct")
+    conversation.visualize(tokenizer)
+
+    encoded = conversation.encode(tokenizer)
+    print(encoded)
+    print(tokenizer.batch_decode(encoded.tokens))
xinference/thirdparty/fish_speech/fish_speech/i18n/locale/en_US.json
@@ -118,5 +118,6 @@
     "new": "new",
     "Realtime Transform Text": "Realtime Transform Text",
     "Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
-    "Text Normalization": "Text Normalization"
+    "Text Normalization": "Text Normalization",
+    "Select Example Audio": "Select Example Audio"
 }
xinference/thirdparty/fish_speech/fish_speech/i18n/locale/es_ES.json
@@ -118,5 +118,6 @@
     "new": "nuevo",
     "Realtime Transform Text": "Transformación de Texto en Tiempo Real",
     "Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
-    "Text Normalization": "Normalización de Texto"
+    "Text Normalization": "Normalización de Texto",
+    "Select Example Audio": "Selecionar áudio de exemplo"
 }
xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ja_JP.json
@@ -118,6 +118,6 @@
     "new": "新規",
     "Realtime Transform Text": "リアルタイム変換テキスト",
     "Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
-    "Text Normalization": "テキスト正規化"
-
+    "Text Normalization": "テキスト正規化",
+    "Select Example Audio": "サンプル音声を選択"
 }
xinference/thirdparty/fish_speech/fish_speech/i18n/locale/ko_KR.json
@@ -0,0 +1,123 @@
+{
+    "16-mixed is recommended for 10+ series GPU": "10+ 시리즈 GPU에는 16-mixed를 권장합니다.",
+    "5 to 10 seconds of reference audio, useful for specifying speaker.": "화자를 특정하는 데 유의미한 5~10초의 길이의 참조 오디오 데이터.",
+    "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)에서 개발한 VQ-GAN 및 Llama 기반의 텍스트 음성 변환 모델.",
+    "Accumulate Gradient Batches": "그라디언트 배치 누적",
+    "Add to Processing Area": "처리 영역에 추가",
+    "Added path successfully!": "경로가 성공적으로 추가되었습니다!",
+    "Advanced Config": "고급 설정",
+    "Base LLAMA Model": "기본 LLAMA 모델",
+    "Batch Inference": "배치 추론",
+    "Batch Size": "배치 크기",
+    "Changing with the Model Path": "모델 경로에 따라 변경 중",
+    "Chinese": "중국어",
+    "Compile Model": "모델 컴파일",
+    "Compile the model can significantly reduce the inference time, but will increase cold start time": "모델을 컴파일하면 추론 시간이 크게 줄어들지만, 초기 시작 시간이 길어집니다.",
+    "Copy": "복사",
+    "Data Preprocessing": "데이터 전처리",
+    "Data Preprocessing Path": "데이터 전처리 경로",
+    "Data Source": "데이터 소스",
+    "Decoder Model Config": "디코더 모델 설정",
+    "Decoder Model Path": "디코더 모델 경로",
+    "Disabled": "비활성화 됨",
+    "Enable Reference Audio": "참고 음성 활성화",
+    "English": "영어",
+    "Error Message": "오류 메시지",
+    "File Preprocessing": "파일 전처리",
+    "Generate": "생성",
+    "Generated Audio": "생성된 오디오",
+    "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "오디오애 대응하는 텍스트가 없을 경우, ASR을 적용해 지원하며, .txt 또는 .lab 형식을 지원합니다.",
+    "Infer interface is closed": "추론 인터페이스가 닫혔습니다.",
+    "Inference Configuration": "추론 설정",
+    "Inference Server Configuration": "추론 서버 설정",
+    "Inference Server Error": "추론 서버 오류",
+    "Inferring interface is launched at {}": "추론 인터페이스가 {}에서 시작되었습니다.",
+    "Initial Learning Rate": "초기 학습률",
+    "Input Audio & Source Path for Transcription": "전사할 입력 오디오 및 소스 경로",
+    "Input Text": "입력 텍스트",
+    "Invalid path: {}": "유효하지 않은 경로: {}",
+    "It is recommended to use CUDA, if you have low configuration, use CPU": "CUDA 사용을 권장하며, 낮은 사양일 경우 CPU를 사용하는 것을 권장합니다.",
+    "Iterative Prompt Length, 0 means off": "반복 프롬프트 길이. (0:비활성화)",
+    "Japanese": "일본어",
+    "LLAMA Configuration": "LLAMA 설정",
+    "LLAMA Model Config": "LLAMA 모델 설정",
+    "LLAMA Model Path": "LLAMA 모델 경로",
+    "Labeling Device": "라벨링 장치",
+    "LoRA Model to be merged": "병합할 LoRA 모델",
+    "Maximum Audio Duration": "최대 오디오 길이",
+    "Maximum Length per Sample": "샘플당 최대 길이",
+    "Maximum Training Steps": "최대 학습 단계",
+    "Maximum tokens per batch, 0 means no limit": "배치당 최대 토큰 수(0:제한 없음)",
+    "Merge": "병합",
+    "Merge LoRA": "LoRA 병합",
+    "Merge successfully": "성공적으로 병합 되었습니다.",
+    "Minimum Audio Duration": "최소 오디오 길이",
+    "Model Output Path": "모델 출력 경로",
+    "Model Size": "모델 크기",
+    "Move": "이동",
+    "Move files successfully": "파일이 성공적으로 이동되었습니다.",
+    "No audio generated, please check the input text.": "생성된 오디오가 없습니다. 입력된 텍스트를 확인하세요.",
+    "No selected options": "옵션이 선택되지 않았습니다.",
+    "Number of Workers": "작업자 수",
+    "Open Inference Server": "추론 서버 열기",
+    "Open Labeler WebUI": "라벨러 WebUI 열기",
+    "Open Tensorboard": "Tensorboard 열기",
+    "Opened labeler in browser": "브라우저에서 라벨러가 열렸습니다.",
+    "Optional Label Language": "선택적 라벨 언어",
+    "Optional online ver": "온라인 버전 선택",
+    "Output Path": "출력 경로",
+    "Path error, please check the model file exists in the corresponding path": "경로 오류, 해당 경로에 모델 파일이 있는지 확인하십시오.",
+    "Precision": "정밀도",
+    "Probability of applying Speaker Condition": "화자 조건 적용 확률",
+    "Put your text here.": "여기에 텍스트를 입력하세요.",
+    "Reference Audio": "참고 오디오",
+    "Reference Text": "참고 텍스트",
+    "Related code and weights are released under CC BY-NC-SA 4.0 License.": "관련 코드 및 가중치는 CC BY-NC-SA 4.0 라이선스 하에 배포됩니다.",
+    "Remove Selected Data": "선택한 데이터 제거",
+    "Removed path successfully!": "경로가 성공적으로 제거되었습니다!",
+    "Repetition Penalty": "반복 패널티",
+    "Save model every n steps": "n 단계마다 모델 저장",
+    "Select LLAMA ckpt": "LLAMA ckpt 선택",
+    "Select VITS ckpt": "VITS ckpt 선택",
+    "Select VQGAN ckpt": "VQGAN ckpt 선택",
+    "Select source file processing method": "소스 파일 처리 방법 선택",
+    "Select the model to be trained (Depending on the Tab page you are on)": "학습할 모델 선택(탭 페이지에 따라 다름)",
+    "Selected: {}": "선택됨: {}",
+    "Speaker": "화자",
+    "Speaker is identified by the folder name": "화자는 폴더 이름으로 식별됩니다",
+    "Start Training": "학습 시작",
+    "Streaming Audio": "스트리밍 오디오",
+    "Streaming Generate": "스트리밍 생성",
+    "Tensorboard Host": "Tensorboard 호스트",
+    "Tensorboard Log Path": "Tensorboard 로그 경로",
+    "Tensorboard Port": "Tensorboard 포트",
+    "Tensorboard interface is closed": "Tensorboard 인터페이스가 닫혔습니다",
+    "Tensorboard interface is launched at {}": "Tensorboard 인터페이스가 {}에서 시작되었습니다.",
+    "Text is too long, please keep it under {} characters.": "텍스트가 너무 깁니다. {}자 이하로 입력해주세요.",
+    "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "왼쪽의 입력 폴더 경로 또는 파일 목록의 경로. 체크 여부에 관계없이 이 목록에서 후속 학습에 사용됩니다.",
+    "Training Configuration": "학습 설정",
+    "Training Error": "학습 오류",
+    "Training stopped": "학습이 중지되었습니다.",
+    "Type name of the speaker": "화자의 이름을 입력하세요.",
+    "Type the path or select from the dropdown": "경로를 입력하거나 드롭다운에서 선택하세요.",
+    "Use LoRA": "LoRA 사용",
+    "Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRA를 사용하면 GPU 메모리를 절약할 수 있지만, 모델의 품질이 저하될 수 있습니다.",
+    "Use filelist": "파일 목록 사용",
+    "Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 환경에선 large, 5G에선 medium, 2G에선 small을 사용할 것을 권장합니다.",
+    "VITS Configuration": "VITS 설정",
+    "VQGAN Configuration": "VQGAN 설정",
+    "Validation Batch Size": "검증 배치 크기",
+    "View the status of the preprocessing folder (use the slider to control the depth of the tree)": "전처리 폴더의 상태를 확인합니다(슬라이더를 사용하여 트리의 깊이를 조절합니다)",
+    "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "모델의 오용에 대해 책임지지 않습니다. 사용하기 전에 현지 법률과 규정을 고려하시길 바랍니다.",
+    "WebUI Host": "WebUI 호스트",
+    "WebUI Port": "WebUI 포트",
+    "Whisper Model": "Whisper 모델",
+    "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1)에서 확인하실 수 있습니다.",
+    "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다",
+    "latest": "최신",
+    "new": "새로운",
+    "Realtime Transform Text": "실시간 텍스트 변환",
+    "Normalization Result Preview (Currently Only Chinese)": "정규화 결과 미리보기(현재 중국어만 지원)",
+    "Text Normalization": "텍스트 정규화",
+    "Select Example Audio": "예시 오디오 선택"
+}
xinference/thirdparty/fish_speech/fish_speech/models/text2semantic/llama.py
@@ -1,3 +1,4 @@
+import dataclasses
 import json
 import math
 from collections import OrderedDict
@@ -57,6 +58,10 @@ class BaseModelArgs:
     # Initialize the model
     initializer_range: float = 0.02

+    # Dummy vars
+    is_reward_model: bool = False
+    share_codebook_embeddings: bool = True
+
     def __post_init__(self):
         if self.n_local_heads == -1:
             self.n_local_heads = self.n_head
@@ -100,6 +105,28 @@ class NaiveModelArgs(BaseModelArgs):
 class DualARModelArgs(BaseModelArgs):
     model_type: str = "dual_ar"
     n_fast_layer: int = 4
+    fast_dim: int | None = None
+    fast_n_head: int | None = None
+    fast_n_local_heads: int | None = None
+    fast_head_dim: int | None = None
+    fast_intermediate_size: int | None = None
+    fast_attention_qkv_bias: bool | None = None
+
+    def __post_init__(self):
+        super().__post_init__()
+
+        self.fast_dim = self.fast_dim or self.dim
+        self.fast_n_head = self.fast_n_head or self.n_head
+        self.fast_n_local_heads = self.fast_n_local_heads or self.n_local_heads
+        self.fast_head_dim = self.fast_head_dim or self.head_dim
+        self.fast_intermediate_size = (
+            self.fast_intermediate_size or self.intermediate_size
+        )
+        self.fast_attention_qkv_bias = (
+            self.fast_attention_qkv_bias
+            if self.fast_attention_qkv_bias is not None
+            else self.attention_qkv_bias
+        )


 class KVCache(nn.Module):
@@ -369,7 +396,10 @@ class BaseTransformer(nn.Module):
             model = simple_quantizer.convert_for_runtime()

         weights = torch.load(
-            Path(path) / "model.pth",
+            Path(path) / "model.pth",
+            map_location="cpu",
+            mmap=True,
+            weights_only=True,
         )

         if "state_dict" in weights:
@@ -471,20 +501,46 @@ class DualARTransformer(BaseTransformer):
     def __init__(self, config: NaiveModelArgs, tokenizer: AutoTokenizer) -> None:
         super().__init__(config, init_weights=False, tokenizer=tokenizer)

+        # Project to fast dim if needed
+        if config.fast_dim is not None and config.fast_dim != config.dim:
+            self.fast_project_in = nn.Linear(config.dim, config.fast_dim)
+        else:
+            self.fast_project_in = nn.Identity()
+
         # Fast transformer
-        self.fast_embeddings = nn.Embedding(config.codebook_size, config.
+        self.fast_embeddings = nn.Embedding(config.codebook_size, config.fast_dim)

         # The equivalent bs is so large that sdpa doesn't work
+        override_config = dataclasses.replace(
+            config,
+            dim=config.fast_dim,
+            n_head=config.fast_n_head,
+            n_local_heads=config.fast_n_local_heads,
+            head_dim=config.fast_head_dim,
+            intermediate_size=config.fast_intermediate_size,
+            attention_qkv_bias=config.fast_attention_qkv_bias,
+        )
+
         self.fast_layers = nn.ModuleList(
-            TransformerBlock(
+            TransformerBlock(override_config, use_sdpa=False)
+            for _ in range(config.n_fast_layer)
         )
-        self.fast_norm = RMSNorm(config.
+        self.fast_norm = RMSNorm(config.fast_dim, eps=config.norm_eps)
         self.fast_output = nn.Linear(
-            config.
+            config.fast_dim,
             config.codebook_size,
             bias=False,
         )

+        self.register_buffer(
+            "fast_freqs_cis",
+            precompute_freqs_cis(
+                config.num_codebooks,
+                config.fast_dim // config.fast_n_head,
+                config.rope_base,
+            ),
+            persistent=False,
+        )
         self.apply(self._init_weights)

     def setup_caches(
@@ -492,7 +548,7 @@ class DualARTransformer(BaseTransformer):
     ):
         super().setup_caches(max_batch_size, max_seq_len, dtype)

-        head_dim = self.config.
+        head_dim = self.config.fast_dim // self.config.fast_n_head

         # Fast transformer
         # The max seq len here is the number of codebooks
@@ -500,7 +556,7 @@ class DualARTransformer(BaseTransformer):
             b.attention.kv_cache = KVCache(
                 max_batch_size,
                 self.config.num_codebooks,
-                self.config.
+                self.config.fast_n_local_heads,
                 head_dim,
                 dtype=dtype,
             )
@@ -513,13 +569,13 @@ class DualARTransformer(BaseTransformer):
         parent_result = super().forward(inp, key_padding_mask)
         token_logits = parent_result.logits
         x = parent_result.hidden_states
+        x = self.fast_project_in(x)

         # Fast transformer
         fast_seq_len = self.config.num_codebooks
         fast_mask = self.causal_mask[
             None, None, :fast_seq_len, :fast_seq_len
         ] # (B, N, Q, K)
-        fast_freqs_cis = self.freqs_cis[:fast_seq_len]

         # Drop the last token and rotate left
         codebooks = inp[:, 1:-1, 1:]
@@ -542,9 +598,11 @@ class DualARTransformer(BaseTransformer):

         for layer in self.fast_layers:
             if self.config.use_gradient_checkpointing and self.training:
-                x = checkpoint(
+                x = checkpoint(
+                    layer, x, self.fast_freqs_cis, fast_mask, use_reentrant=True
+                )
             else:
-                x = layer(x, fast_freqs_cis, fast_mask)
+                x = layer(x, self.fast_freqs_cis, fast_mask)

         # unflatten the batch and num_codebooks
         fast_out = self.fast_norm(x)
@@ -584,7 +642,7 @@ class DualARTransformer(BaseTransformer):
         fast_mask = self.causal_mask[
             None, None, input_pos, : self.config.num_codebooks
         ] # (B, N, Q, K)
-        fast_freqs_cis = self.
+        fast_freqs_cis = self.fast_freqs_cis[input_pos]

         for layer in self.fast_layers:
             x = layer(x, fast_freqs_cis, fast_mask, input_pos=input_pos)
@@ -595,6 +653,13 @@ class DualARTransformer(BaseTransformer):

         return codebook_logits

+    def forward_generate(
+        self, x: Tensor, input_pos: Optional[Tensor] = None
+    ) -> TransformerForwardResult:
+        x = super().forward_generate(x, input_pos)
+        x.hidden_states = self.fast_project_in(x.hidden_states)
+        return x
+

 class TransformerBlock(nn.Module):
     def __init__(self, config: BaseModelArgs, use_sdpa: bool = True) -> None:
xinference/thirdparty/fish_speech/fish_speech/models/vqgan/modules/firefly.py
@@ -102,8 +102,8 @@ class FishConvNet(nn.Module):
         self.conv = weight_norm(self.conv, name=name, dim=dim)
         return self

-    def
-        self.conv = remove_parametrizations(self.conv)
+    def remove_parametrizations(self, name="weight"):
+        self.conv = remove_parametrizations(self.conv, name)
         return self


@@ -128,8 +128,8 @@ class FishTransConvNet(nn.Module):
         self.conv = weight_norm(self.conv, name=name, dim=dim)
         return self

-    def
-        self.conv = remove_parametrizations(self.conv)
+    def remove_parametrizations(self, name="weight"):
+        self.conv = remove_parametrizations(self.conv, name)
         return self


@@ -178,9 +178,9 @@ class ResBlock1(torch.nn.Module):

     def remove_parametrizations(self):
         for conv in self.convs1:
-            remove_parametrizations(
+            conv.remove_parametrizations()
         for conv in self.convs2:
-            remove_parametrizations(
+            conv.remove_parametrizations()


 class ParallelBlock(nn.Module):
@@ -288,11 +288,11 @@ class HiFiGANGenerator(nn.Module):

     def remove_parametrizations(self):
         for up in self.ups:
-            remove_parametrizations(
+            up.remove_parametrizations()
         for block in self.resblocks:
             block.remove_parametrizations()
-
-
+        self.conv_pre.remove_parametrizations()
+        self.conv_post.remove_parametrizations()


 # DropPath copied from timm library
|